X86ISelLowering.cpp revision 162ee5c725fdfd327243dc11520afb88659f2ed4
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "Utils/X86ShuffleDecode.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static SDValue Insert128BitVector(SDValue Result,
                                  SDValue Vec,
                                  SDValue Idx,
                                  SelectionDAG &DAG,
                                  DebugLoc dl);

static SDValue Extract128BitVector(SDValue Vec,
                                   SDValue Idx,
                                   SelectionDAG &DAG,
                                   DebugLoc dl);

/// Generate a DAG to grab 128 bits from a vector > 128 bits.  This
/// sets things up to match an AVX VEXTRACTF128 instruction or a
/// simple subregister reference.  Idx is an index in the 128 bits we
/// want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec,
                                   SDValue Idx,
                                   SelectionDAG &DAG,
                                   DebugLoc dl) {
  EVT VT = Vec.getValueType();
  assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
  EVT ElVT = VT.getVectorElementType();
  int Factor = VT.getSizeInBits()/128;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract from UNDEF is UNDEF.
  if (Vec.getOpcode() == ISD::UNDEF)
    return DAG.getNode(ISD::UNDEF, dl, ResultVT);

  if (isa<ConstantSDNode>(Idx)) {
    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

    // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
    // that we can match to VEXTRACTF128.
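    // Illustrative worked example of the index math below: extracting
    // element 5 of a v8i32.  ElVT = i32, so ElemsPerChunk = 128/32 = 4, and
    // NormalizedIdxVal = ((5 * 32) / 128) * 4 = 1 * 4 = 4, i.e. the extract
    // starts at the first element of the upper 128-bit half.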
    unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();

    // This is the index of the first element of the 128-bit chunk
    // we want.
    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
                                 * ElemsPerChunk);

    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
    SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                                 VecIdx);

    return Result;
  }

  return SDValue();
}

/// Generate a DAG to put 128 bits into a vector > 128 bits.  This
/// sets things up to match an AVX VINSERTF128 instruction or a
/// simple superregister reference.  Idx is an index in the 128 bits
/// we want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result,
                                  SDValue Vec,
                                  SDValue Idx,
                                  SelectionDAG &DAG,
                                  DebugLoc dl) {
  if (isa<ConstantSDNode>(Idx)) {
    EVT VT = Vec.getValueType();
    assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");

    EVT ElVT = VT.getVectorElementType();
    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
    EVT ResultVT = Result.getValueType();

    // Insert the relevant 128 bits.
    unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();

    // This is the index of the first element of the 128-bit chunk
    // we want.
    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
                                 * ElemsPerChunk);

    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
    Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                         VecIdx);
    return Result;
  }

  return SDValue();
}

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
  bool is64Bit = Subtarget->is64Bit();

  if (Subtarget->isTargetEnvMacho()) {
    if (is64Bit)
      return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  }

  if (Subtarget->isTargetELF())
    return new TargetLoweringObjectFileELF();
  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    return new TargetLoweringObjectFileCOFF();
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasXMMInt();
  X86ScalarSSEf32 = Subtarget->hasXMM();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.
  static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird: it always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);

  // For 64-bit, use the ILP scheduler since we have so many registers; for
  // 32-bit code, use register-pressure-specific scheduling.
  if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    // Set up the Windows compiler runtime calls.
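    // Illustrative example: on 32-bit Windows there is no native 64-bit
    // divide, so an i64 "a / b" becomes a call to the runtime helper
    // _alldiv(a, b), using the stdcall convention set below.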
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2");
    setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C);
    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C);
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but plain longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
  } else if (!UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP   , MVT::i16  , Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP   , MVT::i32  , Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP   , MVT::i16  , Custom);
      setOperationAction(ISD::SINT_TO_FP   , MVT::i32  , Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
  }

  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT       , MVT::i64  , Custom);
  setOperationAction(ISD::SINT_TO_FP       , MVT::i64  , Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions.  This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions.  However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86.
  // Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (unsigned i = 0, e = 4; i != e; ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }

  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);

  setOperationAction(ISD::CTTZ             , MVT::i8   , Custom);
  setOperationAction(ISD::CTLZ             , MVT::i8   , Custom);
  setOperationAction(ISD::CTTZ             , MVT::i16  , Custom);
  setOperationAction(ISD::CTLZ             , MVT::i16  , Custom);
  setOperationAction(ISD::CTTZ             , MVT::i32  , Custom);
  setOperationAction(ISD::CTLZ             , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTTZ           , MVT::i64  , Custom);
    setOperationAction(ISD::CTLZ           , MVT::i64  , Custom);
  }

  if (Subtarget->hasPOPCNT()) {
    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
  } else {
    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT           , MVT::i1   , Promote);
  // X86 wants to expand cmov itself.
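  // Illustrative sketch of this custom lowering: (select cond, a, b) is
  // turned into an X86ISD::CMOV node whose condition is an EFLAGS value, so
  // it can match the CMOVcc instruction family directly.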
  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
  }
  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86).
  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
  }

  if (Subtarget->hasXMM())
    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);

  // We may not have a libcall for MEMBARRIER, so we should lower this.
  setOperationAction(ISD::MEMBARRIER      , MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
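  // Illustrative example: the IR sequence "fence; atomicrmw add; fence" can
  // be emitted as a single LOCK ADD, since the locked instruction already
  // provides the required ordering on x86.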
  setShouldFoldAtomicFences(true);

  // Expand certain atomics.
  for (unsigned i = 0, e = 4; i != e; ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
  }

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC,
                     (Subtarget->is64Bit() ? MVT::i64 : MVT::i32),
                     (Subtarget->isTargetCOFF()
                      && !Subtarget->isTargetEnvMacho()
                      ? Custom : Expand));

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f64, Custom);
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f64, Custom);
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // Lower this to FGETSIGNx86 plus an AND.
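    // Illustrative sketch: FGETSIGN of an f32 held in an XMM register
    // becomes (and (X86ISD::FGETSIGNx86 x), 1), where FGETSIGNx86 matches
    // MOVMSKPS, so the sign bit is read without a trip through memory.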
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN , MVT::f64, Expand);
    setOperationAction(ISD::FCOS , MVT::f64, Expand);
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87.
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f80 , Expand);
      setOperationAction(ISD::FCOS , MVT::f80 , Expand);
    }

    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization).  Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction((MVT::SimpleValueType)VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && Subtarget->hasMMX()) {
    addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass);
    // No operations on x86mmx are supported; everything uses intrinsics.
  }

  // MMX-sized vectors (other than x86mmx) are expected to be expanded
  // into smaller operations.
  setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
  setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
  setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
  setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
  setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
  setOperationAction(ISD::AND,                MVT::v4i16, Expand);
  setOperationAction(ISD::AND,                MVT::v2i32, Expand);
  setOperationAction(ISD::AND,                MVT::v1i64, Expand);
  setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
  setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
  setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
  setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
  setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
  setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
  setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
  setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
  setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
  setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
  setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
  setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
  setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
  setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
  setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
  setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);

  if (!UseSoftFloat && Subtarget->hasXMM()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC,             MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasXMMInt()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC,             MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC,             MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC,             MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC,             MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);

    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
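    // Illustrative example of this promotion: a v4i32 AND is rewritten as
    // bitcast-to-v2i64, AND on v2i64, bitcast back to v4i32, so one set of
    // bitwise patterns (PAND/POR/PXOR) covers all the 128-bit integer types.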
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND,    SVT, Promote);
      AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
      setOperationAction(ISD::OR,     SVT, Promote);
      AddPromotedToType (ISD::OR,     SVT, MVT::v2i64);
      setOperationAction(ISD::XOR,    SVT, Promote);
      AddPromotedToType (ISD::XOR,    SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD,   SVT, Promote);
      AddPromotedToType (ISD::LOAD,   SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
  }

  if (Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
    setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
    setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
    setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
    setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
    setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
    setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
    setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
    setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
    setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);

    // Can turn SHL into an integer multiply.
    setOperationAction(ISD::SHL,                MVT::v4i32, Custom);
    setOperationAction(ISD::SHL,                MVT::v16i8, Custom);

    // i8 and i16 vectors are custom because the source register and source
    // memory operand types are not the same width.  f32 vectors are custom
    // since the immediate controlling the insert encodes additional
    // information.
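    // Illustrative example: for v4f32 the INSERTPS immediate packs the
    // source lane, the destination lane, and a zero mask into a single byte,
    // which a generic insert pattern cannot express.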
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE2()) {
    setOperationAction(ISD::SRL,                MVT::v2i64, Custom);
    setOperationAction(ISD::SRL,                MVT::v4i32, Custom);
    setOperationAction(ISD::SRL,                MVT::v16i8, Custom);

    setOperationAction(ISD::SHL,                MVT::v2i64, Custom);
    setOperationAction(ISD::SHL,                MVT::v4i32, Custom);
    setOperationAction(ISD::SHL,                MVT::v8i16, Custom);

    setOperationAction(ISD::SRA,                MVT::v4i32, Custom);
    setOperationAction(ISD::SRA,                MVT::v8i16, Custom);
  }

  if (Subtarget->hasSSE42())
    setOperationAction(ISD::VSETCC,             MVT::v2i64, Custom);

  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v32i8,  X86::VR256RegisterClass);
    addRegisterClass(MVT::v16i16, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32,  X86::VR256RegisterClass);
    addRegisterClass(MVT::v8f32,  X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64,  X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64,  X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);

    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);

    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);

    // Custom lower several nodes for 256-bit types.
    for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Extract subvector is special because the value type
      // (result) is 128-bit but the source is 256-bit wide.
      if (VT.is128BitVector())
        setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom);

      // Do not attempt to custom lower other non-256-bit vectors.
      if (!VT.is256BitVector())
        continue;

      setOperationAction(ISD::BUILD_VECTOR,       SVT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     SVT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT,  SVT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR,   SVT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR,   SVT, Custom);
    }

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-256-bit vectors.
      if (!VT.is256BitVector())
        continue;

      setOperationAction(ISD::AND,    SVT, Promote);
      AddPromotedToType (ISD::AND,    SVT, MVT::v4i64);
      setOperationAction(ISD::OR,     SVT, Promote);
      AddPromotedToType (ISD::OR,     SVT, MVT::v4i64);
      setOperationAction(ISD::XOR,    SVT, Promote);
      AddPromotedToType (ISD::XOR,    SVT, MVT::v4i64);
      setOperationAction(ISD::LOAD,   SVT, Promote);
      AddPromotedToType (ISD::LOAD,   SVT, MVT::v4i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64);
    }
  }

  // SIGN_EXTEND_INREGs are evaluated by the extend type.  Handle the
  // expansion of this type with custom code.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
                       Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);


  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much
  // better than generic legalization for 64-bit multiplication-with-overflow,
  // though.
  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
    // Add/Sub/Mul with overflow operations are custom lowered.
    MVT VT = IntVTs[i];
    setOperationAction(ISD::SADDO, VT, Custom);
    setOperationAction(ISD::UADDO, VT, Custom);
    setOperationAction(ISD::SSUBO, VT, Custom);
    setOperationAction(ISD::USUBO, VT, Custom);
    setOperationAction(ISD::SMULO, VT, Custom);
    setOperationAction(ISD::UMULO, VT, Custom);
  }

  // There are no 8-bit 3-address imul/mul instructions.
  setOperationAction(ISD::SMULO, MVT::i8, Expand);
  setOperationAction(ISD::UMULO, MVT::i8, Expand);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // On Darwin, -Os means optimize for size without hurting performance,
  // so do not reduce the limit.
  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ?
    16 : 8;
  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;

  setPrefFunctionAlignment(4);
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.  For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasXMM())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target-specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering.  If DstAlign is zero, the destination's alignment can satisfy
/// any constraint.  Similarly, if SrcAlign is zero there is no need to check
/// it against the alignment requirement, probably because the source does
/// not need to be loaded.  If 'NonScalarIntSafe' is true, it's safe to
/// return a non-scalar-integer type, e.g. empty string source, constant, or
/// loaded from memory.  'MemcpyStrSrc' indicates whether the memcpy source
/// is a constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux.  This is because the stack realignment code can't handle certain
  // cases like PR2962.  This should be removed when PR2962 is fixed.
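  // Illustrative example of the logic below: a 32-byte memcpy with 16-byte
  // aligned operands, on an SSE2 subtarget whose stack alignment is at least
  // 16, returns MVT::v4i32 here, so the copy is emitted as two 16-byte
  // vector load/store pairs instead of eight i32 moves.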
  const Function *F = MF.getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasXMMInt()) {
      // Do not use f64 to lower memcpy if source is string constant.  It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function.  The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid,
                                             MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have a DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
}

// FIXME: Why is this routine here?  Move to RegInfo!
std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(EVT VT) const {
  const TargetRegisterClass *RRC = 0;
  uint8_t Cost = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = (Subtarget->is64Bit() ?
           X86::GR64RegisterClass : X86::GR32RegisterClass);
    break;
  case MVT::x86mmx:
    RRC = X86::VR64RegisterClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
  case MVT::v4f64:
    RRC = X86::VR128RegisterClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
                                               unsigned &Offset) const {
  if (!Subtarget->isTargetLinux())
    return false;

  if (Subtarget->is64Bit()) {
    // %fs:0x28, unless we're using a Kernel code model, in which case
    // it's %gs:0x28.
    Offset = 0x28;
    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
      AddressSpace = 256;
    else
      AddressSpace = 257;
  } else {
    // %gs:0x14 on i386.
    Offset = 0x14;
    AddressSpace = 256;
  }
  return true;
}


//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                  MachineFunction &MF, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
                 RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               DebugLoc dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // Add the regs to the liveout set for the function.
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  for (unsigned i = 0; i != RVLocs.size(); ++i)
    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
      MRI.addLiveOut(RVLocs[i].getLocReg());

  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
                                         MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = OutVals[i];
    EVT ValVT = ValToCopy.getValueType();

    // If this is x86-64 and we disabled SSE, we can't return FP values,
    // or SSE or MMX vectors.
    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
        (Subtarget->is64Bit() && !Subtarget->hasXMM())) {
      report_fatal_error("SSE register return with SSE disabled");
    }
    // Likewise we can't return F64 values with SSE1 only.
gcc does so, but 1341 // llvm-gcc has never done it right and no one has noticed, so this 1342 // should be OK for now. 1343 if (ValVT == MVT::f64 && 1344 (Subtarget->is64Bit() && !Subtarget->hasXMMInt())) 1345 report_fatal_error("SSE2 register return with SSE2 disabled"); 1346 1347 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1348 // the RET instruction and handled by the FP Stackifier. 1349 if (VA.getLocReg() == X86::ST0 || 1350 VA.getLocReg() == X86::ST1) { 1351 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1352 // change the value to the FP stack register class. 1353 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1354 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1355 RetOps.push_back(ValToCopy); 1356 // Don't emit a copytoreg. 1357 continue; 1358 } 1359 1360 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1361 // which is returned in RAX / RDX. 1362 if (Subtarget->is64Bit()) { 1363 if (ValVT == MVT::x86mmx) { 1364 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1365 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 1366 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1367 ValToCopy); 1368 // If we don't have SSE2 available, convert to v4f32 so the generated 1369 // register is legal. 1370 if (!Subtarget->hasSSE2()) 1371 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 1372 } 1373 } 1374 } 1375 1376 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1377 Flag = Chain.getValue(1); 1378 } 1379 1380 // The x86-64 ABI for returning structs by value requires that we copy 1381 // the sret argument into %rax for the return. We saved the argument into 1382 // a virtual register in the entry block, so now we copy the value out 1383 // and into %rax. 1384 if (Subtarget->is64Bit() && 1385 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1386 MachineFunction &MF = DAG.getMachineFunction(); 1387 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1388 unsigned Reg = FuncInfo->getSRetReturnReg(); 1389 assert(Reg && 1390 "SRetReturnReg should have been set in LowerFormalArguments()."); 1391 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1392 1393 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1394 Flag = Chain.getValue(1); 1395 1396 // RAX now acts like a return value. 1397 MRI.addLiveOut(X86::RAX); 1398 } 1399 1400 RetOps[0] = Chain; // Update chain. 1401 1402 // Add the flag if we have it. 1403 if (Flag.getNode()) 1404 RetOps.push_back(Flag); 1405 1406 return DAG.getNode(X86ISD::RET_FLAG, dl, 1407 MVT::Other, &RetOps[0], RetOps.size()); 1408} 1409 1410bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { 1411 if (N->getNumValues() != 1) 1412 return false; 1413 if (!N->hasNUsesOfValue(1, 0)) 1414 return false; 1415 1416 SDNode *Copy = *N->use_begin(); 1417 if (Copy->getOpcode() != ISD::CopyToReg && 1418 Copy->getOpcode() != ISD::FP_EXTEND) 1419 return false; 1420 1421 bool HasRet = false; 1422 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 1423 UI != UE; ++UI) { 1424 if (UI->getOpcode() != X86ISD::RET_FLAG) 1425 return false; 1426 HasRet = true; 1427 } 1428 1429 return HasRet; 1430} 1431 1432EVT 1433X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, 1434 ISD::NodeType ExtendKind) const { 1435 MVT ReturnMVT; 1436 // TODO: Is this also valid on 32-bit? 
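// For example (assuming the usual x86-64 convention), a function declared to
// return 'zeroext i1' materializes its result as 0 or 1 in AL, so widening
// the return type to i8 here matches the byte the caller actually reads,
// rather than promoting all the way to i32.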
1437 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
1438 ReturnMVT = MVT::i8;
1439 else
1440 ReturnMVT = MVT::i32;
1441
1442 EVT MinVT = getRegisterType(Context, ReturnMVT);
1443 return VT.bitsLT(MinVT) ? MinVT : VT;
1444 }
1445
1446 /// LowerCallResult - Lower the result values of a call into the
1447 /// appropriate copies out of appropriate physical registers.
1448 ///
1449 SDValue
1450 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1451 CallingConv::ID CallConv, bool isVarArg,
1452 const SmallVectorImpl<ISD::InputArg> &Ins,
1453 DebugLoc dl, SelectionDAG &DAG,
1454 SmallVectorImpl<SDValue> &InVals) const {
1455
1456 // Assign locations to each value returned by this call.
1457 SmallVector<CCValAssign, 16> RVLocs;
1458 bool Is64Bit = Subtarget->is64Bit();
1459 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1460 getTargetMachine(), RVLocs, *DAG.getContext());
1461 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1462
1463 // Copy all of the result registers out of their specified physreg.
1464 for (unsigned i = 0; i != RVLocs.size(); ++i) {
1465 CCValAssign &VA = RVLocs[i];
1466 EVT CopyVT = VA.getValVT();
1467
1468 // If this is x86-64, and we disabled SSE, we can't return FP values.
1469 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1470 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) {
1471 report_fatal_error("SSE register return with SSE disabled");
1472 }
1473
1474 SDValue Val;
1475
1476 // If this is a call to a function that returns an fp value on the floating
1477 // point stack, we must guarantee that the value is popped from the stack, so
1478 // a CopyFromReg is not good enough - the copy instruction may be eliminated
1479 // if the return value is not used. We use the FpPOP_RETVAL instruction
1480 // instead.
1481 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
1482 // If we prefer to use the value in xmm registers, copy it out as f80 and
1483 // use a truncate to move it from fp stack reg to xmm reg.
1484 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
1485 SDValue Ops[] = { Chain, InFlag };
1486 Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
1487 MVT::Other, MVT::Glue, Ops, 2), 1);
1488 Val = Chain.getValue(0);
1489
1490 // Round the f80 to the right size, which also moves it to the appropriate
1491 // xmm register.
1492 if (CopyVT != VA.getValVT())
1493 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1494 // This truncation won't change the value.
1495 DAG.getIntPtrConstant(1));
1496 } else {
1497 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1498 CopyVT, InFlag).getValue(1);
1499 Val = Chain.getValue(0);
1500 }
1501 InFlag = Chain.getValue(2);
1502 InVals.push_back(Val);
1503 }
1504
1505 return Chain;
1506 }
1507
1508
1509 //===----------------------------------------------------------------------===//
1510 // C & StdCall & Fast Calling Convention implementation
1511 //===----------------------------------------------------------------------===//
1512 // The StdCall calling convention is standard for many Windows API routines.
1513 // It differs from the C calling convention only a little: the callee, not the
1514 // caller, cleans up the stack, and symbols are decorated in some fancy way :)
1515 // It doesn't support any vector arguments.
1516 // For info on the fast calling convention see Fast Calling Convention (tail
1517 // call) implementation LowerX86_32FastCCCallTo.
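// As a rough illustration (hypothetical function, not one referenced here):
// for a callee 'void @f(i32, i32)' using x86_stdcallcc, the two arguments
// occupy 8 bytes of stack, the callee pops them itself with 'ret $8', and the
// symbol is decorated as _f@8. Under the C convention the callee would use a
// plain 'ret' and the caller would adjust ESP after the call.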
1518 1519/// CallIsStructReturn - Determines whether a call uses struct return 1520/// semantics. 1521static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1522 if (Outs.empty()) 1523 return false; 1524 1525 return Outs[0].Flags.isSRet(); 1526} 1527 1528/// ArgsAreStructReturn - Determines whether a function uses struct 1529/// return semantics. 1530static bool 1531ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1532 if (Ins.empty()) 1533 return false; 1534 1535 return Ins[0].Flags.isSRet(); 1536} 1537 1538/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1539/// by "Src" to address "Dst" with size and alignment information specified by 1540/// the specific parameter attribute. The copy will be passed as a byval 1541/// function parameter. 1542static SDValue 1543CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1544 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1545 DebugLoc dl) { 1546 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1547 1548 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1549 /*isVolatile*/false, /*AlwaysInline=*/true, 1550 MachinePointerInfo(), MachinePointerInfo()); 1551} 1552 1553/// IsTailCallConvention - Return true if the calling convention is one that 1554/// supports tail call optimization. 1555static bool IsTailCallConvention(CallingConv::ID CC) { 1556 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1557} 1558 1559bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 1560 if (!CI->isTailCall()) 1561 return false; 1562 1563 CallSite CS(CI); 1564 CallingConv::ID CalleeCC = CS.getCallingConv(); 1565 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) 1566 return false; 1567 1568 return true; 1569} 1570 1571/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1572/// a tailcall target by changing its ABI. 1573static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1574 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1575} 1576 1577SDValue 1578X86TargetLowering::LowerMemArgument(SDValue Chain, 1579 CallingConv::ID CallConv, 1580 const SmallVectorImpl<ISD::InputArg> &Ins, 1581 DebugLoc dl, SelectionDAG &DAG, 1582 const CCValAssign &VA, 1583 MachineFrameInfo *MFI, 1584 unsigned i) const { 1585 // Create the nodes corresponding to a load from this parameter slot. 1586 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1587 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1588 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1589 EVT ValVT; 1590 1591 // If value is passed by pointer we have address passed instead of the value 1592 // itself. 1593 if (VA.getLocInfo() == CCValAssign::Indirect) 1594 ValVT = VA.getLocVT(); 1595 else 1596 ValVT = VA.getValVT(); 1597 1598 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1599 // changed with more analysis. 1600 // In case of tail call optimization mark all arguments mutable. Since they 1601 // could be overwritten by lowering of arguments in case of a tail call. 1602 if (Flags.isByVal()) { 1603 unsigned Bytes = Flags.getByValSize(); 1604 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. 
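// For instance, a 12-byte byval struct passed at stack offset 8 becomes a
// 12-byte fixed frame object at that offset, and the argument lowers to the
// frame index itself (an address); no load is emitted for byval arguments.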
1605 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); 1606 return DAG.getFrameIndex(FI, getPointerTy()); 1607 } else { 1608 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1609 VA.getLocMemOffset(), isImmutable); 1610 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1611 return DAG.getLoad(ValVT, dl, Chain, FIN, 1612 MachinePointerInfo::getFixedStack(FI), 1613 false, false, 0); 1614 } 1615} 1616 1617SDValue 1618X86TargetLowering::LowerFormalArguments(SDValue Chain, 1619 CallingConv::ID CallConv, 1620 bool isVarArg, 1621 const SmallVectorImpl<ISD::InputArg> &Ins, 1622 DebugLoc dl, 1623 SelectionDAG &DAG, 1624 SmallVectorImpl<SDValue> &InVals) 1625 const { 1626 MachineFunction &MF = DAG.getMachineFunction(); 1627 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1628 1629 const Function* Fn = MF.getFunction(); 1630 if (Fn->hasExternalLinkage() && 1631 Subtarget->isTargetCygMing() && 1632 Fn->getName() == "main") 1633 FuncInfo->setForceFramePointer(true); 1634 1635 MachineFrameInfo *MFI = MF.getFrameInfo(); 1636 bool Is64Bit = Subtarget->is64Bit(); 1637 bool IsWin64 = Subtarget->isTargetWin64(); 1638 1639 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1640 "Var args not supported with calling convention fastcc or ghc"); 1641 1642 // Assign locations to all of the incoming arguments. 1643 SmallVector<CCValAssign, 16> ArgLocs; 1644 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1645 ArgLocs, *DAG.getContext()); 1646 1647 // Allocate shadow area for Win64 1648 if (IsWin64) { 1649 CCInfo.AllocateStack(32, 8); 1650 } 1651 1652 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 1653 1654 unsigned LastVal = ~0U; 1655 SDValue ArgValue; 1656 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1657 CCValAssign &VA = ArgLocs[i]; 1658 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1659 // places. 1660 assert(VA.getValNo() != LastVal && 1661 "Don't support value assigned to multiple locs yet"); 1662 LastVal = VA.getValNo(); 1663 1664 if (VA.isRegLoc()) { 1665 EVT RegVT = VA.getLocVT(); 1666 TargetRegisterClass *RC = NULL; 1667 if (RegVT == MVT::i32) 1668 RC = X86::GR32RegisterClass; 1669 else if (Is64Bit && RegVT == MVT::i64) 1670 RC = X86::GR64RegisterClass; 1671 else if (RegVT == MVT::f32) 1672 RC = X86::FR32RegisterClass; 1673 else if (RegVT == MVT::f64) 1674 RC = X86::FR64RegisterClass; 1675 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1676 RC = X86::VR256RegisterClass; 1677 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1678 RC = X86::VR128RegisterClass; 1679 else if (RegVT == MVT::x86mmx) 1680 RC = X86::VR64RegisterClass; 1681 else 1682 llvm_unreachable("Unknown argument type!"); 1683 1684 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1685 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1686 1687 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1688 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1689 // right size. 
1690 if (VA.getLocInfo() == CCValAssign::SExt) 1691 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1692 DAG.getValueType(VA.getValVT())); 1693 else if (VA.getLocInfo() == CCValAssign::ZExt) 1694 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1695 DAG.getValueType(VA.getValVT())); 1696 else if (VA.getLocInfo() == CCValAssign::BCvt) 1697 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 1698 1699 if (VA.isExtInLoc()) { 1700 // Handle MMX values passed in XMM regs. 1701 if (RegVT.isVector()) { 1702 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), 1703 ArgValue); 1704 } else 1705 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1706 } 1707 } else { 1708 assert(VA.isMemLoc()); 1709 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1710 } 1711 1712 // If value is passed via pointer - do a load. 1713 if (VA.getLocInfo() == CCValAssign::Indirect) 1714 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 1715 MachinePointerInfo(), false, false, 0); 1716 1717 InVals.push_back(ArgValue); 1718 } 1719 1720 // The x86-64 ABI for returning structs by value requires that we copy 1721 // the sret argument into %rax for the return. Save the argument into 1722 // a virtual register so that we can access it from the return points. 1723 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1724 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1725 unsigned Reg = FuncInfo->getSRetReturnReg(); 1726 if (!Reg) { 1727 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1728 FuncInfo->setSRetReturnReg(Reg); 1729 } 1730 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1731 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1732 } 1733 1734 unsigned StackSize = CCInfo.getNextStackOffset(); 1735 // Align stack specially for tail calls. 1736 if (FuncIsMadeTailCallSafe(CallConv)) 1737 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1738 1739 // If the function takes variable number of arguments, make a frame index for 1740 // the start of the first vararg value... for expansion of llvm.va_start. 1741 if (isVarArg) { 1742 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1743 CallConv != CallingConv::X86_ThisCall)) { 1744 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1745 } 1746 if (Is64Bit) { 1747 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1748 1749 // FIXME: We should really autogenerate these arrays 1750 static const unsigned GPR64ArgRegsWin64[] = { 1751 X86::RCX, X86::RDX, X86::R8, X86::R9 1752 }; 1753 static const unsigned GPR64ArgRegs64Bit[] = { 1754 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1755 }; 1756 static const unsigned XMMArgRegs64Bit[] = { 1757 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1758 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1759 }; 1760 const unsigned *GPR64ArgRegs; 1761 unsigned NumXMMRegs = 0; 1762 1763 if (IsWin64) { 1764 // The XMM registers which might contain var arg parameters are shadowed 1765 // in their paired GPR. So we only need to save the GPR to their home 1766 // slots. 
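// E.g. (sketch) for a Win64 varargs callee taking (int, ...) called with a
// double second argument, the double travels in XMM1 but is also duplicated
// in RDX, so spilling just RCX/RDX/R8/R9 to the 32-byte home area makes every
// register argument reachable through va_arg.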
1767 TotalNumIntRegs = 4;
1768 GPR64ArgRegs = GPR64ArgRegsWin64;
1769 } else {
1770 TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1771 GPR64ArgRegs = GPR64ArgRegs64Bit;
1772
1773 NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs);
1774 }
1775 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1776 TotalNumIntRegs);
1777
1778 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
1779 assert(!(NumXMMRegs && !Subtarget->hasXMM()) &&
1780 "SSE register cannot be used when SSE is disabled!");
1781 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
1782 "SSE register cannot be used when SSE is disabled!");
1783 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM())
1784 // Kernel mode asks for SSE to be disabled, so don't push the XMM
1785 // registers on the stack.
1786 TotalNumXMMRegs = 0;
1787
1788 if (IsWin64) {
1789 const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
1790 // Get to the caller-allocated home save location. Add 8 to account
1791 // for the return address.
1792 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
1793 FuncInfo->setRegSaveFrameIndex(
1794 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
1795 // Fixup to set vararg frame on shadow area (4 x i64).
1796 if (NumIntRegs < 4)
1797 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
1798 } else {
1799 // For X86-64, if there are vararg parameters that are passed via
1800 // registers, then we must store them to their spots on the stack so they
1801 // may be loaded by dereferencing the result of va_next.
1802 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
1803 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
1804 FuncInfo->setRegSaveFrameIndex(
1805 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
1806 false));
1807 }
1808
1809 // Store the integer parameter registers.
1810 SmallVector<SDValue, 8> MemOps;
1811 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
1812 getPointerTy());
1813 unsigned Offset = FuncInfo->getVarArgsGPOffset();
1814 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
1815 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
1816 DAG.getIntPtrConstant(Offset));
1817 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
1818 X86::GR64RegisterClass);
1819 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
1820 SDValue Store =
1821 DAG.getStore(Val.getValue(1), dl, Val, FIN,
1822 MachinePointerInfo::getFixedStack(
1823 FuncInfo->getRegSaveFrameIndex(), Offset),
1824 false, false, 0);
1825 MemOps.push_back(Store);
1826 Offset += 8;
1827 }
1828
1829 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
1830 // Now store the XMM (fp + vector) parameter registers.
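// In the non-Win64 case above, this builds the AMD64 ABI register save area:
// 6 GPRs at offsets 0-47 followed by 8 XMM registers at offsets 48-175 (176
// bytes total), with va_list's gp_offset/fp_offset starting at NumIntRegs*8
// and 48 + NumXMMRegs*16, i.e. just past the named arguments.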
1831 SmallVector<SDValue, 11> SaveXMMOps; 1832 SaveXMMOps.push_back(Chain); 1833 1834 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1835 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1836 SaveXMMOps.push_back(ALVal); 1837 1838 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1839 FuncInfo->getRegSaveFrameIndex())); 1840 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1841 FuncInfo->getVarArgsFPOffset())); 1842 1843 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1844 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 1845 X86::VR128RegisterClass); 1846 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1847 SaveXMMOps.push_back(Val); 1848 } 1849 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1850 MVT::Other, 1851 &SaveXMMOps[0], SaveXMMOps.size())); 1852 } 1853 1854 if (!MemOps.empty()) 1855 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1856 &MemOps[0], MemOps.size()); 1857 } 1858 } 1859 1860 // Some CCs need callee pop. 1861 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) { 1862 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1863 } else { 1864 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1865 // If this is an sret function, the return should pop the hidden pointer. 1866 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1867 FuncInfo->setBytesToPopOnReturn(4); 1868 } 1869 1870 if (!Is64Bit) { 1871 // RegSaveFrameIndex is X86-64 only. 1872 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1873 if (CallConv == CallingConv::X86_FastCall || 1874 CallConv == CallingConv::X86_ThisCall) 1875 // fastcc functions can't have varargs. 1876 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1877 } 1878 1879 return Chain; 1880} 1881 1882SDValue 1883X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1884 SDValue StackPtr, SDValue Arg, 1885 DebugLoc dl, SelectionDAG &DAG, 1886 const CCValAssign &VA, 1887 ISD::ArgFlagsTy Flags) const { 1888 unsigned LocMemOffset = VA.getLocMemOffset(); 1889 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1890 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1891 if (Flags.isByVal()) 1892 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1893 1894 return DAG.getStore(Chain, dl, Arg, PtrOff, 1895 MachinePointerInfo::getStack(LocMemOffset), 1896 false, false, 0); 1897} 1898 1899/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1900/// optimization is performed and it is required. 1901SDValue 1902X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1903 SDValue &OutRetAddr, SDValue Chain, 1904 bool IsTailCall, bool Is64Bit, 1905 int FPDiff, DebugLoc dl) const { 1906 // Adjust the Return address stack slot. 1907 EVT VT = getPointerTy(); 1908 OutRetAddr = getReturnAddressFrameIndex(DAG); 1909 1910 // Load the "old" Return address. 1911 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 1912 false, false, 0); 1913 return SDValue(OutRetAddr.getNode(), 1); 1914} 1915 1916/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call 1917/// optimization is performed and it is required (FPDiff!=0). 1918static SDValue 1919EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1920 SDValue Chain, SDValue RetAddrFrIdx, 1921 bool Is64Bit, int FPDiff, DebugLoc dl) { 1922 // Store the return address to the appropriate stack slot. 
1923 if (!FPDiff) return Chain;
1924 // Calculate the new stack slot for the return address.
1925 int SlotSize = Is64Bit ? 8 : 4;
1926 int NewReturnAddrFI =
1927 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
1928 EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
1929 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
1930 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1931 MachinePointerInfo::getFixedStack(NewReturnAddrFI),
1932 false, false, 0);
1933 return Chain;
1934 }
1935
1936 SDValue
1937 X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1938 CallingConv::ID CallConv, bool isVarArg,
1939 bool &isTailCall,
1940 const SmallVectorImpl<ISD::OutputArg> &Outs,
1941 const SmallVectorImpl<SDValue> &OutVals,
1942 const SmallVectorImpl<ISD::InputArg> &Ins,
1943 DebugLoc dl, SelectionDAG &DAG,
1944 SmallVectorImpl<SDValue> &InVals) const {
1945 MachineFunction &MF = DAG.getMachineFunction();
1946 bool Is64Bit = Subtarget->is64Bit();
1947 bool IsWin64 = Subtarget->isTargetWin64();
1948 bool IsStructRet = CallIsStructReturn(Outs);
1949 bool IsSibcall = false;
1950
1951 if (isTailCall) {
1952 // Check if it's really possible to do a tail call.
1953 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1954 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
1955 Outs, OutVals, Ins, DAG);
1956
1957 // Sibcalls are automatically detected tailcalls which do not require
1958 // ABI changes.
1959 if (!GuaranteedTailCallOpt && isTailCall)
1960 IsSibcall = true;
1961
1962 if (isTailCall)
1963 ++NumTailCalls;
1964 }
1965
1966 assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1967 "Var args not supported with calling convention fastcc or ghc");
1968
1969 // Analyze operands of the call, assigning locations to each operand.
1970 SmallVector<CCValAssign, 16> ArgLocs;
1971 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1972 ArgLocs, *DAG.getContext());
1973
1974 // Allocate shadow area for Win64
1975 if (IsWin64) {
1976 CCInfo.AllocateStack(32, 8);
1977 }
1978
1979 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
1980
1981 // Get a count of how many bytes are to be pushed on the stack.
1982 unsigned NumBytes = CCInfo.getNextStackOffset();
1983 if (IsSibcall)
1984 // This is a sibcall. The memory operands are already available in the
1985 // caller's own stack.
1986 NumBytes = 0;
1987 else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv))
1988 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
1989
1990 int FPDiff = 0;
1991 if (isTailCall && !IsSibcall) {
1992 // Lower arguments at fp - stackoffset + fpdiff.
1993 unsigned NumBytesCallerPushed =
1994 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1995 FPDiff = NumBytesCallerPushed - NumBytes;
1996
1997 // Record the delta by which the returnaddr stack slot moves, but only
1998 // if the new delta is lower than any previously recorded delta.
1999 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
2000 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
2001 }
2002
2003 if (!IsSibcall)
2004 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
2005
2006 SDValue RetAddrFrIdx;
2007 // Load return address for tail calls.
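// As a sketch of the FPDiff math: if the caller pops 16 bytes of its own
// arguments on return but the tail call needs 24 bytes, FPDiff is
// 16 - 24 = -8, and the return address loaded here is later re-stored at
// FPDiff - SlotSize relative to its old slot so the callee still returns
// through it.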
2008 if (isTailCall && FPDiff)
2009 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2010 Is64Bit, FPDiff, dl);
2011
2012 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2013 SmallVector<SDValue, 8> MemOpChains;
2014 SDValue StackPtr;
2015
2016 // Walk the register/memloc assignments, inserting copies/loads. In the case
2017 // of tail call optimization arguments are handled later.
2018 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2019 CCValAssign &VA = ArgLocs[i];
2020 EVT RegVT = VA.getLocVT();
2021 SDValue Arg = OutVals[i];
2022 ISD::ArgFlagsTy Flags = Outs[i].Flags;
2023 bool isByVal = Flags.isByVal();
2024
2025 // Promote the value if needed.
2026 switch (VA.getLocInfo()) {
2027 default: llvm_unreachable("Unknown loc info!");
2028 case CCValAssign::Full: break;
2029 case CCValAssign::SExt:
2030 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2031 break;
2032 case CCValAssign::ZExt:
2033 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2034 break;
2035 case CCValAssign::AExt:
2036 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
2037 // Special case: passing MMX values in XMM registers.
2038 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2039 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2040 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2041 } else
2042 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2043 break;
2044 case CCValAssign::BCvt:
2045 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2046 break;
2047 case CCValAssign::Indirect: {
2048 // Store the argument.
2049 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2050 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2051 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2052 MachinePointerInfo::getFixedStack(FI),
2053 false, false, 0);
2054 Arg = SpillSlot;
2055 break;
2056 }
2057 }
2058
2059 if (VA.isRegLoc()) {
2060 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2061 if (isVarArg && IsWin64) {
2062 // The Win64 ABI requires argument XMM regs to be copied to the
2063 // corresponding shadow reg if the callee is a varargs function.
2064 unsigned ShadowReg = 0;
2065 switch (VA.getLocReg()) {
2066 case X86::XMM0: ShadowReg = X86::RCX; break;
2067 case X86::XMM1: ShadowReg = X86::RDX; break;
2068 case X86::XMM2: ShadowReg = X86::R8; break;
2069 case X86::XMM3: ShadowReg = X86::R9; break;
2070 }
2071 if (ShadowReg)
2072 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2073 }
2074 } else if (!IsSibcall && (!isTailCall || isByVal)) {
2075 assert(VA.isMemLoc());
2076 if (StackPtr.getNode() == 0)
2077 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
2078 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2079 dl, DAG, VA, Flags));
2080 }
2081 }
2082
2083 if (!MemOpChains.empty())
2084 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2085 &MemOpChains[0], MemOpChains.size());
2086
2087 // Build a sequence of copy-to-reg nodes chained together with token chain
2088 // and flag operands which copy the outgoing args into registers.
2089 SDValue InFlag;
2090 // Tail call byval lowering might overwrite argument registers so in case of
2091 // tail call optimization the copies to registers are lowered later.
2092 if (!isTailCall)
2093 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2094 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2095 RegsToPass[i].second, InFlag);
2096 InFlag = Chain.getValue(1);
2097 }
2098
2099 if (Subtarget->isPICStyleGOT()) {
2100 // ELF / PIC requires the GOT pointer to be live in the EBX register
2101 // before function calls via the PLT.
2102 if (!isTailCall) {
2103 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
2104 DAG.getNode(X86ISD::GlobalBaseReg,
2105 DebugLoc(), getPointerTy()),
2106 InFlag);
2107 InFlag = Chain.getValue(1);
2108 } else {
2109 // If we are tail calling and generating PIC/GOT style code load the
2110 // address of the callee into ECX. The value in ecx is used as target of
2111 // the tail jump. This is done to circumvent the ebx/callee-saved problem
2112 // for tail calls on PIC/GOT architectures. Normally we would just put the
2113 // address of GOT into ebx and then call target@PLT. But for tail calls
2114 // ebx would be restored (since ebx is callee saved) before jumping to the
2115 // target@PLT.
2116
2117 // Note: The actual moving to ECX is done further down.
2118 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2119 if (G && !G->getGlobal()->hasHiddenVisibility() &&
2120 !G->getGlobal()->hasProtectedVisibility())
2121 Callee = LowerGlobalAddress(Callee, DAG);
2122 else if (isa<ExternalSymbolSDNode>(Callee))
2123 Callee = LowerExternalSymbol(Callee, DAG);
2124 }
2125 }
2126
2127 if (Is64Bit && isVarArg && !IsWin64) {
2128 // From the AMD64 ABI document:
2129 // For calls that may call functions that use varargs or stdargs
2130 // (prototype-less calls or calls to functions containing ellipsis (...) in
2131 // the declaration) %al is used as hidden argument to specify the number
2132 // of SSE registers used. The contents of %al do not need to match exactly
2133 // the number of registers, but must be an upper bound on the number of SSE
2134 // registers used and is in the range 0 - 8 inclusive.
2135
2136 // Count the number of XMM registers allocated.
2137 static const unsigned XMMArgRegs[] = {
2138 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2139 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2140 };
2141 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2142 assert((Subtarget->hasXMM() || !NumXMMRegs)
2143 && "SSE registers cannot be used when SSE is disabled");
2144
2145 Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
2146 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
2147 InFlag = Chain.getValue(1);
2148 }
2149
2150
2151 // For tail calls lower the arguments to the 'real' stack slot.
2152 if (isTailCall) {
2153 // Force all the incoming stack arguments to be loaded from the stack
2154 // before any new outgoing arguments are stored to the stack, because the
2155 // outgoing stack slots may alias the incoming argument stack slots, and
2156 // the alias isn't otherwise explicit. This is slightly more conservative
2157 // than necessary, because it means that each store effectively depends
2158 // on every argument instead of just those arguments it would clobber.
2159 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2160
2161 SmallVector<SDValue, 8> MemOpChains2;
2162 SDValue FIN;
2163 int FI = 0;
2164 // Do not flag preceding copytoreg stuff together with the following stuff.
2165 InFlag = SDValue();
2166 if (GuaranteedTailCallOpt) {
2167 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2168 CCValAssign &VA = ArgLocs[i];
2169 if (VA.isRegLoc())
2170 continue;
2171 assert(VA.isMemLoc());
2172 SDValue Arg = OutVals[i];
2173 ISD::ArgFlagsTy Flags = Outs[i].Flags;
2174 // Create frame index.
2175 int32_t Offset = VA.getLocMemOffset()+FPDiff;
2176 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2177 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2178 FIN = DAG.getFrameIndex(FI, getPointerTy());
2179
2180 if (Flags.isByVal()) {
2181 // Copy relative to framepointer.
2182 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2183 if (StackPtr.getNode() == 0)
2184 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
2185 getPointerTy());
2186 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2187
2188 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2189 ArgChain,
2190 Flags, DAG, dl));
2191 } else {
2192 // Store relative to framepointer.
2193 MemOpChains2.push_back(
2194 DAG.getStore(ArgChain, dl, Arg, FIN,
2195 MachinePointerInfo::getFixedStack(FI),
2196 false, false, 0));
2197 }
2198 }
2199 }
2200
2201 if (!MemOpChains2.empty())
2202 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2203 &MemOpChains2[0], MemOpChains2.size());
2204
2205 // Copy arguments to their registers.
2206 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2207 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2208 RegsToPass[i].second, InFlag);
2209 InFlag = Chain.getValue(1);
2210 }
2211 InFlag = SDValue();
2212
2213 // Store the return address to the appropriate stack slot.
2214 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
2215 FPDiff, dl);
2216 }
2217
2218 if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2219 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2220 // In the 64-bit large code model, we have to make all calls
2221 // through a register, since the call instruction's 32-bit
2222 // pc-relative offset may not be large enough to hold the whole
2223 // address.
2224 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2225 // If the callee is a GlobalAddress node (quite common, every direct call
2226 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
2227 // it.
2228
2229 // We should use an extra load for direct calls to dllimported functions in
2230 // non-JIT mode.
2231 const GlobalValue *GV = G->getGlobal();
2232 if (!GV->hasDLLImportLinkage()) {
2233 unsigned char OpFlags = 0;
2234 bool ExtraLoad = false;
2235 unsigned WrapperKind = ISD::DELETED_NODE;
2236
2237 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2238 // external symbols must go through the PLT in PIC mode. If the symbol
2239 // has hidden or protected visibility, or if it is static or local, then
2240 // we don't need to use the PLT - we can directly call it.
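// E.g. when compiling a shared object with PIC, a call to an undefined
// function foo becomes 'call foo@PLT', while a call to a hidden-visibility
// function defined in the same module can be a direct pc-relative call.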
2241 if (Subtarget->isTargetELF() && 2242 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2243 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2244 OpFlags = X86II::MO_PLT; 2245 } else if (Subtarget->isPICStyleStubAny() && 2246 (GV->isDeclaration() || GV->isWeakForLinker()) && 2247 (!Subtarget->getTargetTriple().isMacOSX() || 2248 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2249 // PC-relative references to external symbols should go through $stub, 2250 // unless we're building with the leopard linker or later, which 2251 // automatically synthesizes these stubs. 2252 OpFlags = X86II::MO_DARWIN_STUB; 2253 } else if (Subtarget->isPICStyleRIPRel() && 2254 isa<Function>(GV) && 2255 cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) { 2256 // If the function is marked as non-lazy, generate an indirect call 2257 // which loads from the GOT directly. This avoids runtime overhead 2258 // at the cost of eager binding (and one extra byte of encoding). 2259 OpFlags = X86II::MO_GOTPCREL; 2260 WrapperKind = X86ISD::WrapperRIP; 2261 ExtraLoad = true; 2262 } 2263 2264 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2265 G->getOffset(), OpFlags); 2266 2267 // Add a wrapper if needed. 2268 if (WrapperKind != ISD::DELETED_NODE) 2269 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); 2270 // Add extra indirection if needed. 2271 if (ExtraLoad) 2272 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, 2273 MachinePointerInfo::getGOT(), 2274 false, false, 0); 2275 } 2276 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2277 unsigned char OpFlags = 0; 2278 2279 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to 2280 // external symbols should go through the PLT. 2281 if (Subtarget->isTargetELF() && 2282 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2283 OpFlags = X86II::MO_PLT; 2284 } else if (Subtarget->isPICStyleStubAny() && 2285 (!Subtarget->getTargetTriple().isMacOSX() || 2286 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2287 // PC-relative references to external symbols should go through $stub, 2288 // unless we're building with the leopard linker or later, which 2289 // automatically synthesizes these stubs. 2290 OpFlags = X86II::MO_DARWIN_STUB; 2291 } 2292 2293 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2294 OpFlags); 2295 } 2296 2297 // Returns a chain & a flag for retval copy to use. 2298 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2299 SmallVector<SDValue, 8> Ops; 2300 2301 if (!IsSibcall && isTailCall) { 2302 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2303 DAG.getIntPtrConstant(0, true), InFlag); 2304 InFlag = Chain.getValue(1); 2305 } 2306 2307 Ops.push_back(Chain); 2308 Ops.push_back(Callee); 2309 2310 if (isTailCall) 2311 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2312 2313 // Add argument registers to the end of the list so that they are known live 2314 // into the call. 2315 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2316 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2317 RegsToPass[i].second.getValueType())); 2318 2319 // Add an implicit use GOT pointer in EBX. 2320 if (!isTailCall && Subtarget->isPICStyleGOT()) 2321 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2322 2323 // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. 
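// E.g. printf("%f", x) gets 'movb $1, %al' before the call since one XMM
// register carries a vararg, while a pure-integer printf call gets AL = 0.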
2324 if (Is64Bit && isVarArg && !IsWin64)
2325 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
2326
2327 if (InFlag.getNode())
2328 Ops.push_back(InFlag);
2329
2330 if (isTailCall) {
2331 // We used to do:
2332 //// If this is the first return lowered for this function, add the regs
2333 //// to the liveout set for the function.
2334 // This isn't right, although it's probably harmless on x86; liveouts
2335 // should be computed from returns not tail calls. Consider a void
2336 // function making a tail call to a function returning int.
2337 return DAG.getNode(X86ISD::TC_RETURN, dl,
2338 NodeTys, &Ops[0], Ops.size());
2339 }
2340
2341 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2342 InFlag = Chain.getValue(1);
2343
2344 // Create the CALLSEQ_END node.
2345 unsigned NumBytesForCalleeToPush;
2346 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt))
2347 NumBytesForCalleeToPush = NumBytes; // Callee pops everything
2348 else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
2349 // If this is a call to a struct-return function, the callee
2350 // pops the hidden struct pointer, so we have to push it back.
2351 // This is common for Darwin/X86, Linux & Mingw32 targets.
2352 NumBytesForCalleeToPush = 4;
2353 else
2354 NumBytesForCalleeToPush = 0; // Callee pops nothing.
2355
2356 // Returns a flag for retval copy to use.
2357 if (!IsSibcall) {
2358 Chain = DAG.getCALLSEQ_END(Chain,
2359 DAG.getIntPtrConstant(NumBytes, true),
2360 DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2361 true),
2362 InFlag);
2363 InFlag = Chain.getValue(1);
2364 }
2365
2366 // Handle result values, copying them out of physregs into vregs that we
2367 // return.
2368 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2369 Ins, dl, DAG, InVals);
2370 }
2371
2372
2373 //===----------------------------------------------------------------------===//
2374 // Fast Calling Convention (tail call) implementation
2375 //===----------------------------------------------------------------------===//
2376
2377 // Like StdCall, the callee cleans up the arguments, except that ECX is
2378 // reserved for storing the address of the tail-called function. Only 2
2379 // registers are free for argument passing (inreg). Tail call optimization is
2380 // performed provided:
2381 // * tailcallopt is enabled
2382 // * caller/callee are fastcc
2383 // On the X86_64 architecture with GOT-style position independent code only
2384 // local (within module) calls are supported at the moment.
2385 // To keep the stack aligned according to the platform ABI the function
2386 // GetAlignedArgumentStackSize ensures that the argument delta is always a
2387 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
2388 // for example.) If a tail-called callee has more arguments than the caller,
2389 // the caller needs to make sure that there is room to move the RETADDR to.
2390 // This is achieved by reserving an area the size of the argument delta right
2391 // after the original RETADDR, but before the saved framepointer or the
2392 // spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
2393 // stack layout:
2394 // arg1
2395 // arg2
2396 // RETADDR
2397 // [ new RETADDR
2398 // move area ]
2399 // (possible EBP)
2400 // ESI
2401 // EDI
2402 // local1 ..
2403
2404 /// GetAlignedArgumentStackSize - Make the stack size aligned, e.g. to
2405 /// 16n + 12 for a 16-byte alignment requirement.
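/// For example, on 32-bit (SlotSize = 4) with 16-byte stack alignment:
/// a StackSize of 17 has (17 & 15) = 1 <= 12, so it is bumped to 28, while a
/// StackSize of 30 has (30 & 15) = 14 > 12 and becomes 16 + 16 + 12 = 44.
/// Either way the result has the form 16n + 12, leaving exactly one 4-byte
/// return-address slot to bring the stack back to a 16-byte boundary.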
2406unsigned 2407X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2408 SelectionDAG& DAG) const { 2409 MachineFunction &MF = DAG.getMachineFunction(); 2410 const TargetMachine &TM = MF.getTarget(); 2411 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 2412 unsigned StackAlignment = TFI.getStackAlignment(); 2413 uint64_t AlignMask = StackAlignment - 1; 2414 int64_t Offset = StackSize; 2415 uint64_t SlotSize = TD->getPointerSize(); 2416 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2417 // Number smaller than 12 so just add the difference. 2418 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2419 } else { 2420 // Mask out lower bits, add stackalignment once plus the 12 bytes. 2421 Offset = ((~AlignMask) & Offset) + StackAlignment + 2422 (StackAlignment-SlotSize); 2423 } 2424 return Offset; 2425} 2426 2427/// MatchingStackOffset - Return true if the given stack call argument is 2428/// already available in the same position (relatively) of the caller's 2429/// incoming argument stack. 2430static 2431bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2432 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2433 const X86InstrInfo *TII) { 2434 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2435 int FI = INT_MAX; 2436 if (Arg.getOpcode() == ISD::CopyFromReg) { 2437 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2438 if (!TargetRegisterInfo::isVirtualRegister(VR)) 2439 return false; 2440 MachineInstr *Def = MRI->getVRegDef(VR); 2441 if (!Def) 2442 return false; 2443 if (!Flags.isByVal()) { 2444 if (!TII->isLoadFromStackSlot(Def, FI)) 2445 return false; 2446 } else { 2447 unsigned Opcode = Def->getOpcode(); 2448 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2449 Def->getOperand(1).isFI()) { 2450 FI = Def->getOperand(1).getIndex(); 2451 Bytes = Flags.getByValSize(); 2452 } else 2453 return false; 2454 } 2455 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2456 if (Flags.isByVal()) 2457 // ByVal argument is passed in as a pointer but it's now being 2458 // dereferenced. e.g. 2459 // define @foo(%struct.X* %A) { 2460 // tail call @bar(%struct.X* byval %A) 2461 // } 2462 return false; 2463 SDValue Ptr = Ld->getBasePtr(); 2464 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2465 if (!FINode) 2466 return false; 2467 FI = FINode->getIndex(); 2468 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 2469 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 2470 FI = FINode->getIndex(); 2471 Bytes = Flags.getByValSize(); 2472 } else 2473 return false; 2474 2475 assert(FI != INT_MAX); 2476 if (!MFI->isFixedObjectIndex(FI)) 2477 return false; 2478 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2479} 2480 2481/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2482/// for tail call optimization. Targets which want to do tail call 2483/// optimization should implement this function. 
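/// A typical sibcall candidate (illustrative) is
///   int caller(int x) { return callee(x); }
/// where the argument is already in place and no stack adjustment is needed,
/// so the call can be emitted as a plain 'jmp callee'.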
2484bool 2485X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2486 CallingConv::ID CalleeCC, 2487 bool isVarArg, 2488 bool isCalleeStructRet, 2489 bool isCallerStructRet, 2490 const SmallVectorImpl<ISD::OutputArg> &Outs, 2491 const SmallVectorImpl<SDValue> &OutVals, 2492 const SmallVectorImpl<ISD::InputArg> &Ins, 2493 SelectionDAG& DAG) const { 2494 if (!IsTailCallConvention(CalleeCC) && 2495 CalleeCC != CallingConv::C) 2496 return false; 2497 2498 // If -tailcallopt is specified, make fastcc functions tail-callable. 2499 const MachineFunction &MF = DAG.getMachineFunction(); 2500 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2501 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2502 bool CCMatch = CallerCC == CalleeCC; 2503 2504 if (GuaranteedTailCallOpt) { 2505 if (IsTailCallConvention(CalleeCC) && CCMatch) 2506 return true; 2507 return false; 2508 } 2509 2510 // Look for obvious safe cases to perform tail call optimization that do not 2511 // require ABI changes. This is what gcc calls sibcall. 2512 2513 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2514 // emit a special epilogue. 2515 if (RegInfo->needsStackRealignment(MF)) 2516 return false; 2517 2518 // Also avoid sibcall optimization if either caller or callee uses struct 2519 // return semantics. 2520 if (isCalleeStructRet || isCallerStructRet) 2521 return false; 2522 2523 // An stdcall caller is expected to clean up its arguments; the callee 2524 // isn't going to do that. 2525 if (!CCMatch && CallerCC==CallingConv::X86_StdCall) 2526 return false; 2527 2528 // Do not sibcall optimize vararg calls unless all arguments are passed via 2529 // registers. 2530 if (isVarArg && !Outs.empty()) { 2531 2532 // Optimizing for varargs on Win64 is unlikely to be safe without 2533 // additional testing. 2534 if (Subtarget->isTargetWin64()) 2535 return false; 2536 2537 SmallVector<CCValAssign, 16> ArgLocs; 2538 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2539 getTargetMachine(), ArgLocs, *DAG.getContext()); 2540 2541 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2542 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 2543 if (!ArgLocs[i].isRegLoc()) 2544 return false; 2545 } 2546 2547 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2548 // Therefore if it's not used by the call it is not safe to optimize this into 2549 // a sibcall. 2550 bool Unused = false; 2551 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2552 if (!Ins[i].Used) { 2553 Unused = true; 2554 break; 2555 } 2556 } 2557 if (Unused) { 2558 SmallVector<CCValAssign, 16> RVLocs; 2559 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), 2560 getTargetMachine(), RVLocs, *DAG.getContext()); 2561 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2562 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2563 CCValAssign &VA = RVLocs[i]; 2564 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2565 return false; 2566 } 2567 } 2568 2569 // If the calling conventions do not match, then we'd better make sure the 2570 // results are returned in the same way as what the caller expects. 
2571 if (!CCMatch) { 2572 SmallVector<CCValAssign, 16> RVLocs1; 2573 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), 2574 getTargetMachine(), RVLocs1, *DAG.getContext()); 2575 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2576 2577 SmallVector<CCValAssign, 16> RVLocs2; 2578 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), 2579 getTargetMachine(), RVLocs2, *DAG.getContext()); 2580 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2581 2582 if (RVLocs1.size() != RVLocs2.size()) 2583 return false; 2584 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2585 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2586 return false; 2587 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2588 return false; 2589 if (RVLocs1[i].isRegLoc()) { 2590 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2591 return false; 2592 } else { 2593 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2594 return false; 2595 } 2596 } 2597 } 2598 2599 // If the callee takes no arguments then go on to check the results of the 2600 // call. 2601 if (!Outs.empty()) { 2602 // Check if stack adjustment is needed. For now, do not do this if any 2603 // argument is passed on the stack. 2604 SmallVector<CCValAssign, 16> ArgLocs; 2605 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2606 getTargetMachine(), ArgLocs, *DAG.getContext()); 2607 2608 // Allocate shadow area for Win64 2609 if (Subtarget->isTargetWin64()) { 2610 CCInfo.AllocateStack(32, 8); 2611 } 2612 2613 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2614 if (CCInfo.getNextStackOffset()) { 2615 MachineFunction &MF = DAG.getMachineFunction(); 2616 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2617 return false; 2618 2619 // Check if the arguments are already laid out in the right way as 2620 // the caller's fixed stack objects. 2621 MachineFrameInfo *MFI = MF.getFrameInfo(); 2622 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2623 const X86InstrInfo *TII = 2624 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2625 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2626 CCValAssign &VA = ArgLocs[i]; 2627 SDValue Arg = OutVals[i]; 2628 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2629 if (VA.getLocInfo() == CCValAssign::Indirect) 2630 return false; 2631 if (!VA.isRegLoc()) { 2632 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2633 MFI, MRI, TII)) 2634 return false; 2635 } 2636 } 2637 } 2638 2639 // If the tailcall address may be in a register, then make sure it's 2640 // possible to register allocate for it. In 32-bit, the call address can 2641 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2642 // callee-saved registers are restored. These happen to be the same 2643 // registers used to pass 'inreg' arguments so watch out for those. 
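// For example, a callee taking two inreg i32 arguments will typically have
// them assigned to EAX and EDX (or ECX/EDX for fastcall), leaving only one of
// the three eligible registers free to hold the indirect call target.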
2644 if (!Subtarget->is64Bit() && 2645 !isa<GlobalAddressSDNode>(Callee) && 2646 !isa<ExternalSymbolSDNode>(Callee)) { 2647 unsigned NumInRegs = 0; 2648 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2649 CCValAssign &VA = ArgLocs[i]; 2650 if (!VA.isRegLoc()) 2651 continue; 2652 unsigned Reg = VA.getLocReg(); 2653 switch (Reg) { 2654 default: break; 2655 case X86::EAX: case X86::EDX: case X86::ECX: 2656 if (++NumInRegs == 3) 2657 return false; 2658 break; 2659 } 2660 } 2661 } 2662 } 2663 2664 return true; 2665} 2666 2667FastISel * 2668X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2669 return X86::createFastISel(funcInfo); 2670} 2671 2672 2673//===----------------------------------------------------------------------===// 2674// Other Lowering Hooks 2675//===----------------------------------------------------------------------===// 2676 2677static bool MayFoldLoad(SDValue Op) { 2678 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 2679} 2680 2681static bool MayFoldIntoStore(SDValue Op) { 2682 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 2683} 2684 2685static bool isTargetShuffle(unsigned Opcode) { 2686 switch(Opcode) { 2687 default: return false; 2688 case X86ISD::PSHUFD: 2689 case X86ISD::PSHUFHW: 2690 case X86ISD::PSHUFLW: 2691 case X86ISD::SHUFPD: 2692 case X86ISD::PALIGN: 2693 case X86ISD::SHUFPS: 2694 case X86ISD::MOVLHPS: 2695 case X86ISD::MOVLHPD: 2696 case X86ISD::MOVHLPS: 2697 case X86ISD::MOVLPS: 2698 case X86ISD::MOVLPD: 2699 case X86ISD::MOVSHDUP: 2700 case X86ISD::MOVSLDUP: 2701 case X86ISD::MOVDDUP: 2702 case X86ISD::MOVSS: 2703 case X86ISD::MOVSD: 2704 case X86ISD::UNPCKLPS: 2705 case X86ISD::UNPCKLPD: 2706 case X86ISD::VUNPCKLPSY: 2707 case X86ISD::VUNPCKLPDY: 2708 case X86ISD::PUNPCKLWD: 2709 case X86ISD::PUNPCKLBW: 2710 case X86ISD::PUNPCKLDQ: 2711 case X86ISD::PUNPCKLQDQ: 2712 case X86ISD::UNPCKHPS: 2713 case X86ISD::UNPCKHPD: 2714 case X86ISD::VUNPCKHPSY: 2715 case X86ISD::VUNPCKHPDY: 2716 case X86ISD::PUNPCKHWD: 2717 case X86ISD::PUNPCKHBW: 2718 case X86ISD::PUNPCKHDQ: 2719 case X86ISD::PUNPCKHQDQ: 2720 case X86ISD::VPERMIL: 2721 return true; 2722 } 2723 return false; 2724} 2725 2726static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2727 SDValue V1, SelectionDAG &DAG) { 2728 switch(Opc) { 2729 default: llvm_unreachable("Unknown x86 shuffle node"); 2730 case X86ISD::MOVSHDUP: 2731 case X86ISD::MOVSLDUP: 2732 case X86ISD::MOVDDUP: 2733 return DAG.getNode(Opc, dl, VT, V1); 2734 } 2735 2736 return SDValue(); 2737} 2738 2739static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2740 SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { 2741 switch(Opc) { 2742 default: llvm_unreachable("Unknown x86 shuffle node"); 2743 case X86ISD::PSHUFD: 2744 case X86ISD::PSHUFHW: 2745 case X86ISD::PSHUFLW: 2746 case X86ISD::VPERMIL: 2747 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2748 } 2749 2750 return SDValue(); 2751} 2752 2753static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2754 SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { 2755 switch(Opc) { 2756 default: llvm_unreachable("Unknown x86 shuffle node"); 2757 case X86ISD::PALIGN: 2758 case X86ISD::SHUFPD: 2759 case X86ISD::SHUFPS: 2760 return DAG.getNode(Opc, dl, VT, V1, V2, 2761 DAG.getConstant(TargetMask, MVT::i8)); 2762 } 2763 return SDValue(); 2764} 2765 2766static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2767 SDValue V1, SDValue 
V2, SelectionDAG &DAG) {
2768 switch(Opc) {
2769 default: llvm_unreachable("Unknown x86 shuffle node");
2770 case X86ISD::MOVLHPS:
2771 case X86ISD::MOVLHPD:
2772 case X86ISD::MOVHLPS:
2773 case X86ISD::MOVLPS:
2774 case X86ISD::MOVLPD:
2775 case X86ISD::MOVSS:
2776 case X86ISD::MOVSD:
2777 case X86ISD::UNPCKLPS:
2778 case X86ISD::UNPCKLPD:
2779 case X86ISD::VUNPCKLPSY:
2780 case X86ISD::VUNPCKLPDY:
2781 case X86ISD::PUNPCKLWD:
2782 case X86ISD::PUNPCKLBW:
2783 case X86ISD::PUNPCKLDQ:
2784 case X86ISD::PUNPCKLQDQ:
2785 case X86ISD::UNPCKHPS:
2786 case X86ISD::UNPCKHPD:
2787 case X86ISD::VUNPCKHPSY:
2788 case X86ISD::VUNPCKHPDY:
2789 case X86ISD::PUNPCKHWD:
2790 case X86ISD::PUNPCKHBW:
2791 case X86ISD::PUNPCKHDQ:
2792 case X86ISD::PUNPCKHQDQ:
2793 return DAG.getNode(Opc, dl, VT, V1, V2);
2794 }
2795 return SDValue();
2796 }
2797
2798 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
2799 MachineFunction &MF = DAG.getMachineFunction();
2800 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2801 int ReturnAddrIndex = FuncInfo->getRAIndex();
2802
2803 if (ReturnAddrIndex == 0) {
2804 // Set up a frame object for the return address.
2805 uint64_t SlotSize = TD->getPointerSize();
2806 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
2807 false);
2808 FuncInfo->setRAIndex(ReturnAddrIndex);
2809 }
2810
2811 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2812 }
2813
2814
2815 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2816 bool hasSymbolicDisplacement) {
2817 // Offset should fit into 32 bit immediate field.
2818 if (!isInt<32>(Offset))
2819 return false;
2820
2821 // If we don't have a symbolic displacement, we don't have any extra
2822 // restrictions.
2823 if (!hasSymbolicDisplacement)
2824 return true;
2825
2826 // FIXME: Some tweaks might be needed for medium code model.
2827 if (M != CodeModel::Small && M != CodeModel::Kernel)
2828 return false;
2829
2830 // For the small code model we assume that the last object is 16MB before the
2831 // end of the 31-bit boundary. We may also accept pretty large negative
2832 // constants knowing that all objects are in the positive half of the address
2833 // space.
2834 if (M == CodeModel::Small && Offset < 16*1024*1024)
2835 return true;
2836 // For the kernel code model we know that all objects reside in the negative
2837 // half of the 32-bit address space. We must not accept negative offsets,
2838 // since they may be just out of range, but we may accept pretty large
2839 // positive ones.
2839 if (M == CodeModel::Kernel && Offset > 0)
2840 return true;
2841
2842 return false;
2843 }
2844
2845 /// isCalleePop - Determines whether the callee is required to pop its
2846 /// own arguments. Callee pop is necessary to support tail calls.
2847 bool X86::isCalleePop(CallingConv::ID CallingConv,
2848 bool is64Bit, bool IsVarArg, bool TailCallOpt) {
2849 if (IsVarArg)
2850 return false;
2851
2852 switch (CallingConv) {
2853 default:
2854 return false;
2855 case CallingConv::X86_StdCall:
2856 return !is64Bit;
2857 case CallingConv::X86_FastCall:
2858 return !is64Bit;
2859 case CallingConv::X86_ThisCall:
2860 return !is64Bit;
2861 case CallingConv::Fast:
2862 return TailCallOpt;
2863 case CallingConv::GHC:
2864 return TailCallOpt;
2865 }
2866 }
2867
2868 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86
2869 /// specific condition code, returning the condition code and the LHS/RHS of the
2870 /// comparison to make.
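/// For example, an integer (setlt X, 1) is rewritten below as X <= 0 and
/// yields X86::COND_LE, while a floating-point (setolt LHS, RHS) swaps its
/// operands and yields X86::COND_A, so only CF/ZF as produced by
/// ucomiss/ucomisd need to be inspected.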
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
  if (!isFP) {
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> X == 0, jump !sign.
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_NS;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> X == 0, jump on sign.
        return X86::COND_S;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_LE;
      }
    }

    switch (SetCCOpcode) {
    default: llvm_unreachable("Invalid integer condition!");
    case ISD::SETEQ:  return X86::COND_E;
    case ISD::SETGT:  return X86::COND_G;
    case ISD::SETGE:  return X86::COND_GE;
    case ISD::SETLT:  return X86::COND_L;
    case ISD::SETLE:  return X86::COND_LE;
    case ISD::SETNE:  return X86::COND_NE;
    case ISD::SETULT: return X86::COND_B;
    case ISD::SETUGT: return X86::COND_A;
    case ISD::SETULE: return X86::COND_BE;
    case ISD::SETUGE: return X86::COND_AE;
    }
  }

  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
      !ISD::isNON_EXTLoad(RHS.getNode())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  //  ZF  PF  CF   op
  //   0 | 0 | 0 | X > Y
  //   0 | 0 | 1 | X < Y
  //   1 | 0 | 0 | X == Y
  //   1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}

/// hasFPCMov - Is there a floating-point cmov for the specific X86 condition
/// code? The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}

/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
      return true;
  }
  return false;
}

/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val < 0) || (Val >= Low && Val < Hi);
}

/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
/// specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val < 0 || Val == CmpVal)
    return true;
  return false;
}

/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
/// the second operand.
static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT == MVT::v4f32 || VT == MVT::v4i32)
    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
  if (VT == MVT::v2f64 || VT == MVT::v2i64)
    return (Mask[0] < 2 && Mask[1] < 2);
  return false;
}

bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFDMask(M, N->getValueType(0));
}

/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PSHUFHW.
static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT != MVT::v8i16)
    return false;

  // Lower quadword copied in order or undef.
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;

  // Upper quadword shuffled.
  for (int i = 4; i != 8; ++i)
    if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
      return false;

  return true;
}

bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFHWMask(M, N->getValueType(0));
}

/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PSHUFLW.
static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT != MVT::v8i16)
    return false;

  // Upper quadword copied in order.
  for (int i = 4; i != 8; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;

  // Lower quadword shuffled.
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 4)
      return false;

  return true;
}

bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFLWMask(M, N->getValueType(0));
}

/// isPALIGNRMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PALIGNR.
static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
                          bool hasSSSE3) {
  int i, e = VT.getVectorNumElements();

  // Do not handle v2i64 / v2f64 shuffles with palignr.
  if (e < 4 || !hasSSSE3)
    return false;

  for (i = 0; i != e; ++i)
    if (Mask[i] >= 0)
      break;

  // All undef, not a palignr.
  if (i == e)
    return false;

  // Make sure we're shifting in the right direction.
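  // For example, a v8i16 mask <2, 3, 4, 5, 6, 7, -1, -1> shifts by two
  // lanes: the first defined element is Mask[0] == 2, giving a shift of
  // s == 2, and every later defined element must then equal s plus its
  // position.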
3084 if (Mask[i] <= i) 3085 return false; 3086 3087 int s = Mask[i] - i; 3088 3089 // Check the rest of the elements to see if they are consecutive. 3090 for (++i; i != e; ++i) { 3091 int m = Mask[i]; 3092 if (m >= 0 && m != s+i) 3093 return false; 3094 } 3095 return true; 3096} 3097 3098bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 3099 SmallVector<int, 8> M; 3100 N->getMask(M); 3101 return ::isPALIGNRMask(M, N->getValueType(0), true); 3102} 3103 3104/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 3105/// specifies a shuffle of elements that is suitable for input to SHUFP*. 3106static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3107 int NumElems = VT.getVectorNumElements(); 3108 if (NumElems != 2 && NumElems != 4) 3109 return false; 3110 3111 int Half = NumElems / 2; 3112 for (int i = 0; i < Half; ++i) 3113 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 3114 return false; 3115 for (int i = Half; i < NumElems; ++i) 3116 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 3117 return false; 3118 3119 return true; 3120} 3121 3122bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 3123 SmallVector<int, 8> M; 3124 N->getMask(M); 3125 return ::isSHUFPMask(M, N->getValueType(0)); 3126} 3127 3128/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 3129/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 3130/// half elements to come from vector 1 (which would equal the dest.) and 3131/// the upper half to come from vector 2. 3132static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3133 int NumElems = VT.getVectorNumElements(); 3134 3135 if (NumElems != 2 && NumElems != 4) 3136 return false; 3137 3138 int Half = NumElems / 2; 3139 for (int i = 0; i < Half; ++i) 3140 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 3141 return false; 3142 for (int i = Half; i < NumElems; ++i) 3143 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 3144 return false; 3145 return true; 3146} 3147 3148static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 3149 SmallVector<int, 8> M; 3150 N->getMask(M); 3151 return isCommutedSHUFPMask(M, N->getValueType(0)); 3152} 3153 3154/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 3155/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 3156bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 3157 if (N->getValueType(0).getVectorNumElements() != 4) 3158 return false; 3159 3160 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 3161 return isUndefOrEqual(N->getMaskElt(0), 6) && 3162 isUndefOrEqual(N->getMaskElt(1), 7) && 3163 isUndefOrEqual(N->getMaskElt(2), 2) && 3164 isUndefOrEqual(N->getMaskElt(3), 3); 3165} 3166 3167/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 3168/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 3169/// <2, 3, 2, 3> 3170bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 3171 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3172 3173 if (NumElems != 4) 3174 return false; 3175 3176 return isUndefOrEqual(N->getMaskElt(0), 2) && 3177 isUndefOrEqual(N->getMaskElt(1), 3) && 3178 isUndefOrEqual(N->getMaskElt(2), 2) && 3179 isUndefOrEqual(N->getMaskElt(3), 3); 3180} 3181 3182/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3183/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 
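/// For example, on v4f32 this accepts <4, 5, 2, 3>: the low half comes from
/// V2 (elements 4 and 5 of the concatenated input) and the high half stays
/// in place from V1 (elements 2 and 3).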
3184bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 3185 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3186 3187 if (NumElems != 2 && NumElems != 4) 3188 return false; 3189 3190 for (unsigned i = 0; i < NumElems/2; ++i) 3191 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 3192 return false; 3193 3194 for (unsigned i = NumElems/2; i < NumElems; ++i) 3195 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3196 return false; 3197 3198 return true; 3199} 3200 3201/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3202/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 3203bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 3204 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3205 3206 if ((NumElems != 2 && NumElems != 4) 3207 || N->getValueType(0).getSizeInBits() > 128) 3208 return false; 3209 3210 for (unsigned i = 0; i < NumElems/2; ++i) 3211 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3212 return false; 3213 3214 for (unsigned i = 0; i < NumElems/2; ++i) 3215 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 3216 return false; 3217 3218 return true; 3219} 3220 3221/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3222/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3223static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3224 bool V2IsSplat = false) { 3225 int NumElts = VT.getVectorNumElements(); 3226 3227 assert((VT.is128BitVector() || VT.is256BitVector()) && 3228 "Unsupported vector type for unpckh"); 3229 3230 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) 3231 return false; 3232 3233 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3234 // independently on 128-bit lanes. 3235 unsigned NumLanes = VT.getSizeInBits()/128; 3236 unsigned NumLaneElts = NumElts/NumLanes; 3237 3238 unsigned Start = 0; 3239 unsigned End = NumLaneElts; 3240 for (unsigned s = 0; s < NumLanes; ++s) { 3241 for (unsigned i = Start, j = s * NumLaneElts; 3242 i != End; 3243 i += 2, ++j) { 3244 int BitI = Mask[i]; 3245 int BitI1 = Mask[i+1]; 3246 if (!isUndefOrEqual(BitI, j)) 3247 return false; 3248 if (V2IsSplat) { 3249 if (!isUndefOrEqual(BitI1, NumElts)) 3250 return false; 3251 } else { 3252 if (!isUndefOrEqual(BitI1, j + NumElts)) 3253 return false; 3254 } 3255 } 3256 // Process the next 128 bits. 3257 Start += NumLaneElts; 3258 End += NumLaneElts; 3259 } 3260 3261 return true; 3262} 3263 3264bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3265 SmallVector<int, 8> M; 3266 N->getMask(M); 3267 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3268} 3269 3270/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3271/// specifies a shuffle of elements that is suitable for input to UNPCKH. 3272static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 3273 bool V2IsSplat = false) { 3274 int NumElts = VT.getVectorNumElements(); 3275 3276 assert((VT.is128BitVector() || VT.is256BitVector()) && 3277 "Unsupported vector type for unpckh"); 3278 3279 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) 3280 return false; 3281 3282 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3283 // independently on 128-bit lanes. 
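  // For example, a v8i32 UNPCKH mask on AVX is <2, 10, 3, 11, 6, 14, 7, 15>:
  // each 128-bit lane interleaves the high half of the corresponding lane of
  // both sources.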
3284 unsigned NumLanes = VT.getSizeInBits()/128; 3285 unsigned NumLaneElts = NumElts/NumLanes; 3286 3287 unsigned Start = 0; 3288 unsigned End = NumLaneElts; 3289 for (unsigned l = 0; l != NumLanes; ++l) { 3290 for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2; 3291 i != End; i += 2, ++j) { 3292 int BitI = Mask[i]; 3293 int BitI1 = Mask[i+1]; 3294 if (!isUndefOrEqual(BitI, j)) 3295 return false; 3296 if (V2IsSplat) { 3297 if (isUndefOrEqual(BitI1, NumElts)) 3298 return false; 3299 } else { 3300 if (!isUndefOrEqual(BitI1, j+NumElts)) 3301 return false; 3302 } 3303 } 3304 // Process the next 128 bits. 3305 Start += NumLaneElts; 3306 End += NumLaneElts; 3307 } 3308 return true; 3309} 3310 3311bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3312 SmallVector<int, 8> M; 3313 N->getMask(M); 3314 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 3315} 3316 3317/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3318/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 3319/// <0, 0, 1, 1> 3320static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3321 int NumElems = VT.getVectorNumElements(); 3322 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3323 return false; 3324 3325 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3326 // independently on 128-bit lanes. 3327 unsigned NumLanes = VT.getSizeInBits() / 128; 3328 unsigned NumLaneElts = NumElems / NumLanes; 3329 3330 for (unsigned s = 0; s < NumLanes; ++s) { 3331 for (unsigned i = s * NumLaneElts, j = s * NumLaneElts; 3332 i != NumLaneElts * (s + 1); 3333 i += 2, ++j) { 3334 int BitI = Mask[i]; 3335 int BitI1 = Mask[i+1]; 3336 3337 if (!isUndefOrEqual(BitI, j)) 3338 return false; 3339 if (!isUndefOrEqual(BitI1, j)) 3340 return false; 3341 } 3342 } 3343 3344 return true; 3345} 3346 3347bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 3348 SmallVector<int, 8> M; 3349 N->getMask(M); 3350 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 3351} 3352 3353/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3354/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 3355/// <2, 2, 3, 3> 3356static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3357 int NumElems = VT.getVectorNumElements(); 3358 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3359 return false; 3360 3361 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 3362 int BitI = Mask[i]; 3363 int BitI1 = Mask[i+1]; 3364 if (!isUndefOrEqual(BitI, j)) 3365 return false; 3366 if (!isUndefOrEqual(BitI1, j)) 3367 return false; 3368 } 3369 return true; 3370} 3371 3372bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 3373 SmallVector<int, 8> M; 3374 N->getMask(M); 3375 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 3376} 3377 3378/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3379/// specifies a shuffle of elements that is suitable for input to MOVSS, 3380/// MOVSD, and MOVD, i.e. setting the lowest element. 
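/// For example, the canonical MOVL mask on v4i32 is <4, 1, 2, 3>: element 0
/// is taken from V2 and the remaining elements come from V1 in order.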
static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT.getVectorElementType().getSizeInBits() < 32)
    return false;

  int NumElts = VT.getVectorNumElements();

  if (!isUndefOrEqual(Mask[0], NumElts))
    return false;

  for (int i = 1; i < NumElts; ++i)
    if (!isUndefOrEqual(Mask[i], i))
      return false;

  return true;
}

bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isMOVLMask(M, N->getValueType(0));
}

/// isVPERMILMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to VPERMIL*.
static bool isVPERMILMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits()/128;

  // Match any permutation of a 128-bit vector with 32/64-bit types
  if (NumLanes == 1) {
    if (NumElts == 4 || NumElts == 2)
      return true;
    return false;
  }

  // Only match 256-bit with 32/64-bit types
  if (NumElts != 8 && NumElts != 4)
    return false;

  // The mask on the high lane should be the same as the low. Actually,
  // they can differ if any of the corresponding indices in a lane are undef.
  int LaneSize = NumElts/NumLanes;
  for (int i = 0; i < LaneSize; ++i) {
    int HighElt = i+LaneSize;
    if (Mask[i] < 0 || Mask[HighElt] < 0)
      continue;

    if (Mask[HighElt]-Mask[i] != LaneSize)
      return false;
  }

  return true;
}

/// getShuffleVPERMILImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with VPERMIL* instructions.
static unsigned getShuffleVPERMILImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  EVT VT = SVOp->getValueType(0);

  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits()/128;

  unsigned Mask = 0;
  for (int i = 0; i < NumElts/NumLanes /* lane size */; ++i)
    Mask |= SVOp->getMaskElt(i) << (i*2);

  return Mask;
}

/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse
/// of what x86 movss wants: the lowest element must be the lowest element of
/// vector 2, and the other elements must come from vector 1 in order.
static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
                               bool V2IsSplat = false, bool V2IsUndef = false) {
  int NumOps = VT.getVectorNumElements();
  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
    return false;

  if (!isUndefOrEqual(Mask[0], 0))
    return false;

  for (int i = 1; i < NumOps; ++i)
    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
      return false;

  return true;
}

static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
                           bool V2IsUndef = false) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
}

/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
3481/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> 3482bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, 3483 const X86Subtarget *Subtarget) { 3484 if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) 3485 return false; 3486 3487 // The second vector must be undef 3488 if (N->getOperand(1).getOpcode() != ISD::UNDEF) 3489 return false; 3490 3491 EVT VT = N->getValueType(0); 3492 unsigned NumElems = VT.getVectorNumElements(); 3493 3494 if ((VT.getSizeInBits() == 128 && NumElems != 4) || 3495 (VT.getSizeInBits() == 256 && NumElems != 8)) 3496 return false; 3497 3498 // "i+1" is the value the indexed mask element must have 3499 for (unsigned i = 0; i < NumElems; i += 2) 3500 if (!isUndefOrEqual(N->getMaskElt(i), i+1) || 3501 !isUndefOrEqual(N->getMaskElt(i+1), i+1)) 3502 return false; 3503 3504 return true; 3505} 3506 3507/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3508/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3509/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> 3510bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, 3511 const X86Subtarget *Subtarget) { 3512 if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) 3513 return false; 3514 3515 // The second vector must be undef 3516 if (N->getOperand(1).getOpcode() != ISD::UNDEF) 3517 return false; 3518 3519 EVT VT = N->getValueType(0); 3520 unsigned NumElems = VT.getVectorNumElements(); 3521 3522 if ((VT.getSizeInBits() == 128 && NumElems != 4) || 3523 (VT.getSizeInBits() == 256 && NumElems != 8)) 3524 return false; 3525 3526 // "i" is the value the indexed mask element must have 3527 for (unsigned i = 0; i < NumElems; i += 2) 3528 if (!isUndefOrEqual(N->getMaskElt(i), i) || 3529 !isUndefOrEqual(N->getMaskElt(i+1), i)) 3530 return false; 3531 3532 return true; 3533} 3534 3535/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3536/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3537bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3538 int e = N->getValueType(0).getVectorNumElements() / 2; 3539 3540 for (int i = 0; i < e; ++i) 3541 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3542 return false; 3543 for (int i = 0; i < e; ++i) 3544 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3545 return false; 3546 return true; 3547} 3548 3549/// isVEXTRACTF128Index - Return true if the specified 3550/// EXTRACT_SUBVECTOR operand specifies a vector extract that is 3551/// suitable for input to VEXTRACTF128. 3552bool X86::isVEXTRACTF128Index(SDNode *N) { 3553 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3554 return false; 3555 3556 // The index should be aligned on a 128-bit boundary. 3557 uint64_t Index = 3558 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3559 3560 unsigned VL = N->getValueType(0).getVectorNumElements(); 3561 unsigned VBits = N->getValueType(0).getSizeInBits(); 3562 unsigned ElSize = VBits / VL; 3563 bool Result = (Index * ElSize) % 128 == 0; 3564 3565 return Result; 3566} 3567 3568/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR 3569/// operand specifies a subvector insert that is suitable for input to 3570/// VINSERTF128. 3571bool X86::isVINSERTF128Index(SDNode *N) { 3572 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3573 return false; 3574 3575 // The index should be aligned on a 128-bit boundary. 
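  // For example, inserting a v4f32 subvector into a v8f32 at element index 4
  // starts 4 * 32 == 128 bits in and is therefore a legal VINSERTF128
  // operand, whereas index 2 (64 bits in) is not.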
3576 uint64_t Index = 3577 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3578 3579 unsigned VL = N->getValueType(0).getVectorNumElements(); 3580 unsigned VBits = N->getValueType(0).getSizeInBits(); 3581 unsigned ElSize = VBits / VL; 3582 bool Result = (Index * ElSize) % 128 == 0; 3583 3584 return Result; 3585} 3586 3587/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3588/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3589unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3590 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3591 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3592 3593 unsigned Shift = (NumOperands == 4) ? 2 : 1; 3594 unsigned Mask = 0; 3595 for (int i = 0; i < NumOperands; ++i) { 3596 int Val = SVOp->getMaskElt(NumOperands-i-1); 3597 if (Val < 0) Val = 0; 3598 if (Val >= NumOperands) Val -= NumOperands; 3599 Mask |= Val; 3600 if (i != NumOperands - 1) 3601 Mask <<= Shift; 3602 } 3603 return Mask; 3604} 3605 3606/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3607/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3608unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3609 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3610 unsigned Mask = 0; 3611 // 8 nodes, but we only care about the last 4. 3612 for (unsigned i = 7; i >= 4; --i) { 3613 int Val = SVOp->getMaskElt(i); 3614 if (Val >= 0) 3615 Mask |= (Val - 4); 3616 if (i != 4) 3617 Mask <<= 2; 3618 } 3619 return Mask; 3620} 3621 3622/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3623/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3624unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3625 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3626 unsigned Mask = 0; 3627 // 8 nodes, but we only care about the first 4. 3628 for (int i = 3; i >= 0; --i) { 3629 int Val = SVOp->getMaskElt(i); 3630 if (Val >= 0) 3631 Mask |= Val; 3632 if (i != 0) 3633 Mask <<= 2; 3634 } 3635 return Mask; 3636} 3637 3638/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3639/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3640unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3641 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3642 EVT VVT = N->getValueType(0); 3643 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3644 int Val = 0; 3645 3646 unsigned i, e; 3647 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3648 Val = SVOp->getMaskElt(i); 3649 if (Val >= 0) 3650 break; 3651 } 3652 assert(Val - i > 0 && "PALIGNR imm should be positive"); 3653 return (Val - i) * EltSize; 3654} 3655 3656/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate 3657/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 3658/// instructions. 
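/// For example, extracting from a v8f32 at element index 4 covers elements
/// [4,8), i.e. the upper 128 bits, so the immediate is 4 / (128/32) == 1.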
3659unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { 3660 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3661 llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); 3662 3663 uint64_t Index = 3664 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3665 3666 EVT VecVT = N->getOperand(0).getValueType(); 3667 EVT ElVT = VecVT.getVectorElementType(); 3668 3669 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 3670 return Index / NumElemsPerChunk; 3671} 3672 3673/// getInsertVINSERTF128Immediate - Return the appropriate immediate 3674/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 3675/// instructions. 3676unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { 3677 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3678 llvm_unreachable("Illegal insert subvector for VINSERTF128"); 3679 3680 uint64_t Index = 3681 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3682 3683 EVT VecVT = N->getValueType(0); 3684 EVT ElVT = VecVT.getVectorElementType(); 3685 3686 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 3687 return Index / NumElemsPerChunk; 3688} 3689 3690/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3691/// constant +0.0. 3692bool X86::isZeroNode(SDValue Elt) { 3693 return ((isa<ConstantSDNode>(Elt) && 3694 cast<ConstantSDNode>(Elt)->isNullValue()) || 3695 (isa<ConstantFPSDNode>(Elt) && 3696 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3697} 3698 3699/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3700/// their permute mask. 3701static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3702 SelectionDAG &DAG) { 3703 EVT VT = SVOp->getValueType(0); 3704 unsigned NumElems = VT.getVectorNumElements(); 3705 SmallVector<int, 8> MaskVec; 3706 3707 for (unsigned i = 0; i != NumElems; ++i) { 3708 int idx = SVOp->getMaskElt(i); 3709 if (idx < 0) 3710 MaskVec.push_back(idx); 3711 else if (idx < (int)NumElems) 3712 MaskVec.push_back(idx + NumElems); 3713 else 3714 MaskVec.push_back(idx - NumElems); 3715 } 3716 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3717 SVOp->getOperand(0), &MaskVec[0]); 3718} 3719 3720/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3721/// the two vector operands have swapped position. 3722static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 3723 unsigned NumElems = VT.getVectorNumElements(); 3724 for (unsigned i = 0; i != NumElems; ++i) { 3725 int idx = Mask[i]; 3726 if (idx < 0) 3727 continue; 3728 else if (idx < (int)NumElems) 3729 Mask[i] = idx + NumElems; 3730 else 3731 Mask[i] = idx - NumElems; 3732 } 3733} 3734 3735/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 3736/// match movhlps. The lower half elements should come from upper half of 3737/// V1 (and in order), and the upper half elements should come from the upper 3738/// half of V2 (and in order). 3739static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 3740 if (Op->getValueType(0).getVectorNumElements() != 4) 3741 return false; 3742 for (unsigned i = 0, e = 2; i != e; ++i) 3743 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 3744 return false; 3745 for (unsigned i = 2; i != 4; ++i) 3746 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 3747 return false; 3748 return true; 3749} 3750 3751/// isScalarLoadToVector - Returns true if the node is a scalar load that 3752/// is promoted to a vector. 
It also returns the LoadSDNode by reference if
/// required.
static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
    return false;
  N = N->getOperand(0).getNode();
  if (!ISD::isNON_EXTLoad(N))
    return false;
  if (LD)
    *LD = cast<LoadSDNode>(N);
  return true;
}

/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
/// match movlp{s|d}. The lower half elements should come from the lower half
/// of V1 (and in order), and the upper half elements should come from the
/// upper half of V2 (and in order). And since V1 will become the source of
/// the MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
                               ShuffleVectorSDNode *Op) {
  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
    return false;
  // If V2 is a vector load, don't do this transformation. We will try to use
  // a load-folding shufps op instead.
  if (ISD::isNON_EXTLoad(V2))
    return false;

  unsigned NumElems = Op->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i))
      return false;
  for (unsigned i = NumElems/2; i != NumElems; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
      return false;
  return true;
}

/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
/// all the same.
static bool isSplatVector(SDNode *N) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  SDValue SplatValue = N->getOperand(0);
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    if (N->getOperand(i) != SplatValue)
      return false;
  return true;
}

/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
/// to a zero vector.
/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
static bool isZeroShuffle(ShuffleVectorSDNode *N) {
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  unsigned NumElems = N->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i != NumElems; ++i) {
    int Idx = N->getMaskElt(i);
    if (Idx >= (int)NumElems) {
      unsigned Opc = V2.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
        return false;
    } else if (Idx >= 0) {
      unsigned Opc = V1.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V1.getOperand(Idx)))
        return false;
    }
  }
  return true;
}

/// getZeroVector - Returns a vector of specified type with all zero elements.
///
static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
                             DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");

  // Always build SSE zero vectors as <4 x i32> bitcasted
  // to their dest type. This ensures they get CSE'd.
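  // For example, getZeroVector(MVT::v2f64, /*HasSSE2=*/true, ...) builds a
  // v4i32 zero and bitcasts it to v2f64, so all 128-bit zero vectors share
  // a single node.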
3841 SDValue Vec; 3842 if (VT.getSizeInBits() == 128) { // SSE 3843 if (HasSSE2) { // SSE2 3844 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3845 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3846 } else { // SSE1 3847 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3848 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3849 } 3850 } else if (VT.getSizeInBits() == 256) { // AVX 3851 // 256-bit logic and arithmetic instructions in AVX are 3852 // all floating-point, no support for integer ops. Default 3853 // to emitting fp zeroed vectors then. 3854 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3855 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 3856 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 3857 } 3858 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 3859} 3860 3861/// getOnesVector - Returns a vector of specified type with all bits set. 3862/// Always build ones vectors as <4 x i32>. For 256-bit types, use two 3863/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their 3864/// original type, ensuring they get CSE'd. 3865static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3866 assert(VT.isVector() && "Expected a vector type"); 3867 assert((VT.is128BitVector() || VT.is256BitVector()) 3868 && "Expected a 128-bit or 256-bit vector type"); 3869 3870 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3871 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, 3872 Cst, Cst, Cst, Cst); 3873 3874 if (VT.is256BitVector()) { 3875 SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), 3876 Vec, DAG.getConstant(0, MVT::i32), DAG, dl); 3877 Vec = Insert128BitVector(InsV, Vec, 3878 DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); 3879 } 3880 3881 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 3882} 3883 3884/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3885/// that point to V2 points to its first element. 3886static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3887 EVT VT = SVOp->getValueType(0); 3888 unsigned NumElems = VT.getVectorNumElements(); 3889 3890 bool Changed = false; 3891 SmallVector<int, 8> MaskVec; 3892 SVOp->getMask(MaskVec); 3893 3894 for (unsigned i = 0; i != NumElems; ++i) { 3895 if (MaskVec[i] > (int)NumElems) { 3896 MaskVec[i] = NumElems; 3897 Changed = true; 3898 } 3899 } 3900 if (Changed) 3901 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3902 SVOp->getOperand(1), &MaskVec[0]); 3903 return SDValue(SVOp, 0); 3904} 3905 3906/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 3907/// operation of specified width. 3908static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3909 SDValue V2) { 3910 unsigned NumElems = VT.getVectorNumElements(); 3911 SmallVector<int, 8> Mask; 3912 Mask.push_back(NumElems); 3913 for (unsigned i = 1; i != NumElems; ++i) 3914 Mask.push_back(i); 3915 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3916} 3917 3918/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 
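/// For example, on v4i32 the mask built below is <0, 4, 1, 5>, interleaving
/// the low halves of V1 and V2; getUnpackh builds <2, 6, 3, 7> analogously.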
static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
    Mask.push_back(i);
    Mask.push_back(i + NumElems);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  unsigned Half = NumElems/2;
  SmallVector<int, 8> Mask;
  for (unsigned i = 0; i != Half; ++i) {
    Mask.push_back(i + Half);
    Mask.push_back(i + NumElems + Half);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

// PromoteSplatv8v16 - i16 and i8 vector types can't be used directly by
// a generic shuffle instruction because the target has no such instruction.
// Generate shuffles which repeat i16 and i8 several times until they can be
// represented by v4f32 and then be manipulated by target supported shuffles.
static SDValue PromoteSplatv8v16(SDValue V, SelectionDAG &DAG, int &EltNo) {
  EVT VT = V.getValueType();
  int NumElems = VT.getVectorNumElements();
  DebugLoc dl = V.getDebugLoc();

  while (NumElems > 4) {
    if (EltNo < NumElems/2) {
      V = getUnpackl(DAG, dl, VT, V, V);
    } else {
      V = getUnpackh(DAG, dl, VT, V, V);
      EltNo -= NumElems/2;
    }
    NumElems >>= 1;
  }
  return V;
}

/// getLegalSplat - Generate a legal splat with supported x86 shuffles
static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
  EVT VT = V.getValueType();
  DebugLoc dl = V.getDebugLoc();
  assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
         && "Vector size not supported");

  bool Is128 = VT.getSizeInBits() == 128;
  EVT NVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  V = DAG.getNode(ISD::BITCAST, dl, NVT, V);

  if (Is128) {
    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
    V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]);
  } else {
    // The second half of the indices refers to the higher part, which is a
    // duplication of the lower one. This makes this shuffle a perfect match
    // for the VPERM instruction.
    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
    V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]);
  }

  return DAG.getNode(ISD::BITCAST, dl, VT, V);
}

/// PromoteVectorToScalarSplat - Since there's no native support for
/// scalar_to_vector for 256-bit AVX, a 128-bit scalar_to_vector +
/// INSERT_SUBVECTOR is generated. Recognize this idiom and do the
/// shuffle before the insertion; this yields fewer instructions in the end.
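/// Roughly: a v8f32 splat becomes a v4f32 shuffle of the inserted 128-bit
/// half, followed by two 128-bit inserts of that result into the low and
/// high halves of the destination.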
3994static SDValue PromoteVectorToScalarSplat(ShuffleVectorSDNode *SV, 3995 SelectionDAG &DAG) { 3996 EVT SrcVT = SV->getValueType(0); 3997 SDValue V1 = SV->getOperand(0); 3998 DebugLoc dl = SV->getDebugLoc(); 3999 int NumElems = SrcVT.getVectorNumElements(); 4000 4001 assert(SrcVT.is256BitVector() && "unknown howto handle vector type"); 4002 4003 SmallVector<int, 4> Mask; 4004 for (int i = 0; i < NumElems/2; ++i) 4005 Mask.push_back(SV->getMaskElt(i)); 4006 4007 EVT SVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), 4008 NumElems/2); 4009 SDValue SV1 = DAG.getVectorShuffle(SVT, dl, V1.getOperand(1), 4010 DAG.getUNDEF(SVT), &Mask[0]); 4011 SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), SV1, 4012 DAG.getConstant(0, MVT::i32), DAG, dl); 4013 4014 return Insert128BitVector(InsV, SV1, 4015 DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); 4016} 4017 4018/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32 and 4019/// v8i32, v16i16 or v32i8 to v8f32. 4020static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 4021 EVT SrcVT = SV->getValueType(0); 4022 SDValue V1 = SV->getOperand(0); 4023 DebugLoc dl = SV->getDebugLoc(); 4024 4025 int EltNo = SV->getSplatIndex(); 4026 int NumElems = SrcVT.getVectorNumElements(); 4027 unsigned Size = SrcVT.getSizeInBits(); 4028 4029 // Extract the 128-bit part containing the splat element and update 4030 // the splat element index when it refers to the higher register. 4031 if (Size == 256) { 4032 unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0; 4033 V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl); 4034 if (Idx > 0) 4035 EltNo -= NumElems/2; 4036 } 4037 4038 // Make this 128-bit vector duplicate i8 and i16 elements 4039 if (NumElems > 4) 4040 V1 = PromoteSplatv8v16(V1, DAG, EltNo); 4041 4042 // Recreate the 256-bit vector and place the same 128-bit vector 4043 // into the low and high part. This is necessary because we want 4044 // to use VPERM to shuffle the v8f32 vector, and VPERM only shuffles 4045 // inside each separate v4f32 lane. 4046 if (Size == 256) { 4047 SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1, 4048 DAG.getConstant(0, MVT::i32), DAG, dl); 4049 V1 = Insert128BitVector(InsV, V1, 4050 DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); 4051 } 4052 4053 return getLegalSplat(DAG, V1, EltNo); 4054} 4055 4056/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 4057/// vector of zero or undef vector. This produces a shuffle where the low 4058/// element of V2 is swizzled into the zero/undef vector, landing at element 4059/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 4060static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 4061 bool isZero, bool HasSSE2, 4062 SelectionDAG &DAG) { 4063 EVT VT = V2.getValueType(); 4064 SDValue V1 = isZero 4065 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 4066 unsigned NumElems = VT.getVectorNumElements(); 4067 SmallVector<int, 16> MaskVec; 4068 for (unsigned i = 0; i != NumElems; ++i) 4069 // If this is the insertion idx, put the low elt of V2 here. 4070 MaskVec.push_back(i == Idx ? NumElems : i); 4071 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 4072} 4073 4074/// getShuffleScalarElt - Returns the scalar element that will make up the ith 4075/// element of the result of the vector shuffle. 
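/// For example, for (vector_shuffle V1, V2, <4, 1, 2, 3>) and Index 0 the
/// search resolves into element 0 of V2; if V2 is a BUILD_VECTOR, its
/// operand 0 is returned directly.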
static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
                                   unsigned Depth) {
  if (Depth == 6)
    return SDValue();             // Limit search depth.

  SDValue V = SDValue(N, 0);
  EVT VT = V.getValueType();
  unsigned Opcode = V.getOpcode();

  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
    Index = SV->getMaskElt(Index);

    if (Index < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    int NumElems = VT.getVectorNumElements();
    SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1);
  }

  // Recurse into target specific vector shuffles to find scalars.
  if (isTargetShuffle(Opcode)) {
    int NumElems = VT.getVectorNumElements();
    SmallVector<unsigned, 16> ShuffleMask;
    SDValue ImmN;

    switch(Opcode) {
    case X86ISD::SHUFPS:
    case X86ISD::SHUFPD:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodeSHUFPSMask(NumElems,
                       cast<ConstantSDNode>(ImmN)->getZExtValue(),
                       ShuffleMask);
      break;
    case X86ISD::PUNPCKHBW:
    case X86ISD::PUNPCKHWD:
    case X86ISD::PUNPCKHDQ:
    case X86ISD::PUNPCKHQDQ:
      DecodePUNPCKHMask(NumElems, ShuffleMask);
      break;
    case X86ISD::UNPCKHPS:
    case X86ISD::UNPCKHPD:
    case X86ISD::VUNPCKHPSY:
    case X86ISD::VUNPCKHPDY:
      DecodeUNPCKHPMask(NumElems, ShuffleMask);
      break;
    case X86ISD::PUNPCKLBW:
    case X86ISD::PUNPCKLWD:
    case X86ISD::PUNPCKLDQ:
    case X86ISD::PUNPCKLQDQ:
      DecodePUNPCKLMask(VT, ShuffleMask);
      break;
    case X86ISD::UNPCKLPS:
    case X86ISD::UNPCKLPD:
    case X86ISD::VUNPCKLPSY:
    case X86ISD::VUNPCKLPDY:
      DecodeUNPCKLPMask(VT, ShuffleMask);
      break;
    case X86ISD::MOVHLPS:
      DecodeMOVHLPSMask(NumElems, ShuffleMask);
      break;
    case X86ISD::MOVLHPS:
      DecodeMOVLHPSMask(NumElems, ShuffleMask);
      break;
    case X86ISD::PSHUFD:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodePSHUFMask(NumElems,
                      cast<ConstantSDNode>(ImmN)->getZExtValue(),
                      ShuffleMask);
      break;
    case X86ISD::PSHUFHW:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(),
                        ShuffleMask);
      break;
    case X86ISD::PSHUFLW:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(),
                        ShuffleMask);
      break;
    case X86ISD::MOVSS:
    case X86ISD::MOVSD: {
      // The index 0 always comes from the first element of the second source,
      // this is why MOVSS and MOVSD are used in the first place. The other
      // elements come from the other positions of the first source vector.
      unsigned OpNum = (Index == 0) ? 1 : 0;
      return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
                                 Depth+1);
    }
    case X86ISD::VPERMIL:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodeVPERMILMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                        ShuffleMask);
      break;
    default:
      assert(0 && "not implemented for target shuffle node");
      return SDValue();
    }

    Index = ShuffleMask[Index];
    if (Index < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
                               Depth+1);
  }

  // Actual nodes that may contain scalar elements
  if (Opcode == ISD::BITCAST) {
    V = V.getOperand(0);
    EVT SrcVT = V.getValueType();
    unsigned NumElems = VT.getVectorNumElements();

    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
  }

  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? V.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (V.getOpcode() == ISD::BUILD_VECTOR)
    return V.getOperand(Index);

  return SDValue();
}

/// getNumOfConsecutiveZeros - Return the number of elements of a vector
/// shuffle operation which come consecutively from a zero. The search can
/// start in two different directions, from the left or from the right.
static
unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
                                  bool ZerosFromLeft, SelectionDAG &DAG) {
  int i = 0;

  while (i < NumElems) {
    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
    SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0);
    if (!(Elt.getNode() &&
          (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
      break;
    ++i;
  }

  return i;
}

/// isShuffleMaskConsecutive - Check if the shuffle mask indices from MaskI to
/// MaskE correspond consecutively to elements from one of the vector operands,
/// starting from its index OpIdx. Also tell OpNum which source vector operand
/// they come from.
static
bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
                              int OpIdx, int NumElems, unsigned &OpNum) {
  bool SeenV1 = false;
  bool SeenV2 = false;

  for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
    int Idx = SVOp->getMaskElt(i);
    // Ignore undef indices
    if (Idx < 0)
      continue;

    if (Idx < NumElems)
      SeenV1 = true;
    else
      SeenV2 = true;

    // Only accept consecutive elements from the same vector
    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
      return false;
  }

  OpNum = SeenV1 ? 0 : 1;
  return true;
}

/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
/// logical right shift of a vector.
static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
              false /* check zeros from right */, DAG);
  unsigned OpSrc;

  if (!NumZeros)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
  //
  //                 V1 = {X, A, B, C}     0
  //                          \  \  \     /
  //    vector_shuffle V1, V2 <1, 2, 3, X>
  //
  if (!isShuffleMaskConsecutive(SVOp,
            0,                   // Mask Start Index
            NumElems-NumZeros-1, // Mask End Index
            NumZeros,            // Where to start looking in the src vector
            NumElems,            // Number of elements in vector
            OpSrc))              // Which source operand ?
    return false;

  isLeft = false;
  ShAmt = NumZeros;
  ShVal = SVOp->getOperand(OpSrc);
  return true;
}

/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
/// logical left shift of a vector.
4288static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4289 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4290 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 4291 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 4292 true /* check zeros from left */, DAG); 4293 unsigned OpSrc; 4294 4295 if (!NumZeros) 4296 return false; 4297 4298 // Considering the elements in the mask that are not consecutive zeros, 4299 // check if they consecutively come from only one of the source vectors. 4300 // 4301 // 0 { A, B, X, X } = V2 4302 // / \ / / 4303 // vector_shuffle V1, V2 <X, X, 4, 5> 4304 // 4305 if (!isShuffleMaskConsecutive(SVOp, 4306 NumZeros, // Mask Start Index 4307 NumElems-1, // Mask End Index 4308 0, // Where to start looking in the src vector 4309 NumElems, // Number of elements in vector 4310 OpSrc)) // Which source operand ? 4311 return false; 4312 4313 isLeft = true; 4314 ShAmt = NumZeros; 4315 ShVal = SVOp->getOperand(OpSrc); 4316 return true; 4317} 4318 4319/// isVectorShift - Returns true if the shuffle can be implemented as a 4320/// logical left or right shift of a vector. 4321static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4322 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4323 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 4324 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 4325 return true; 4326 4327 return false; 4328} 4329 4330/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 4331/// 4332static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 4333 unsigned NumNonZero, unsigned NumZero, 4334 SelectionDAG &DAG, 4335 const TargetLowering &TLI) { 4336 if (NumNonZero > 8) 4337 return SDValue(); 4338 4339 DebugLoc dl = Op.getDebugLoc(); 4340 SDValue V(0, 0); 4341 bool First = true; 4342 for (unsigned i = 0; i < 16; ++i) { 4343 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 4344 if (ThisIsNonZero && First) { 4345 if (NumZero) 4346 V = getZeroVector(MVT::v8i16, true, DAG, dl); 4347 else 4348 V = DAG.getUNDEF(MVT::v8i16); 4349 First = false; 4350 } 4351 4352 if ((i & 1) != 0) { 4353 SDValue ThisElt(0, 0), LastElt(0, 0); 4354 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 4355 if (LastIsNonZero) { 4356 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 4357 MVT::i16, Op.getOperand(i-1)); 4358 } 4359 if (ThisIsNonZero) { 4360 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 4361 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 4362 ThisElt, DAG.getConstant(8, MVT::i8)); 4363 if (LastIsNonZero) 4364 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 4365 } else 4366 ThisElt = LastElt; 4367 4368 if (ThisElt.getNode()) 4369 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 4370 DAG.getIntPtrConstant(i/2)); 4371 } 4372 } 4373 4374 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 4375} 4376 4377/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 
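/// The strategy below: start from a zero (or undef) v8i16 and insert only
/// the non-zero scalars, so e.g. <0, x, 0, y, 0, 0, 0, 0> becomes two
/// INSERT_VECTOR_ELT nodes, at indices 1 and 3.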
4378/// 4379static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 4380 unsigned NumNonZero, unsigned NumZero, 4381 SelectionDAG &DAG, 4382 const TargetLowering &TLI) { 4383 if (NumNonZero > 4) 4384 return SDValue(); 4385 4386 DebugLoc dl = Op.getDebugLoc(); 4387 SDValue V(0, 0); 4388 bool First = true; 4389 for (unsigned i = 0; i < 8; ++i) { 4390 bool isNonZero = (NonZeros & (1 << i)) != 0; 4391 if (isNonZero) { 4392 if (First) { 4393 if (NumZero) 4394 V = getZeroVector(MVT::v8i16, true, DAG, dl); 4395 else 4396 V = DAG.getUNDEF(MVT::v8i16); 4397 First = false; 4398 } 4399 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 4400 MVT::v8i16, V, Op.getOperand(i), 4401 DAG.getIntPtrConstant(i)); 4402 } 4403 } 4404 4405 return V; 4406} 4407 4408/// getVShift - Return a vector logical shift node. 4409/// 4410static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 4411 unsigned NumBits, SelectionDAG &DAG, 4412 const TargetLowering &TLI, DebugLoc dl) { 4413 EVT ShVT = MVT::v2i64; 4414 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 4415 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 4416 return DAG.getNode(ISD::BITCAST, dl, VT, 4417 DAG.getNode(Opc, dl, ShVT, SrcOp, 4418 DAG.getConstant(NumBits, 4419 TLI.getShiftAmountTy(SrcOp.getValueType())))); 4420} 4421 4422SDValue 4423X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 4424 SelectionDAG &DAG) const { 4425 4426 // Check if the scalar load can be widened into a vector load. And if 4427 // the address is "base + cst" see if the cst can be "absorbed" into 4428 // the shuffle mask. 4429 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 4430 SDValue Ptr = LD->getBasePtr(); 4431 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 4432 return SDValue(); 4433 EVT PVT = LD->getValueType(0); 4434 if (PVT != MVT::i32 && PVT != MVT::f32) 4435 return SDValue(); 4436 4437 int FI = -1; 4438 int64_t Offset = 0; 4439 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 4440 FI = FINode->getIndex(); 4441 Offset = 0; 4442 } else if (DAG.isBaseWithConstantOffset(Ptr) && 4443 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 4444 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 4445 Offset = Ptr.getConstantOperandVal(1); 4446 Ptr = Ptr.getOperand(0); 4447 } else { 4448 return SDValue(); 4449 } 4450 4451 SDValue Chain = LD->getChain(); 4452 // Make sure the stack object alignment is at least 16. 4453 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4454 if (DAG.InferPtrAlignment(Ptr) < 16) { 4455 if (MFI->isFixedObjectIndex(FI)) { 4456 // Can't change the alignment. FIXME: It's possible to compute 4457 // the exact stack offset and reference FI + adjust offset instead. 4458 // If someone *really* cares about this. That's the way to implement it. 4459 return SDValue(); 4460 } else { 4461 MFI->setObjectAlignment(FI, 16); 4462 } 4463 } 4464 4465 // (Offset % 16) must be multiple of 4. Then address is then 4466 // Ptr + (Offset & ~15). 4467 if (Offset < 0) 4468 return SDValue(); 4469 if ((Offset % 16) & 3) 4470 return SDValue(); 4471 int64_t StartOffset = Offset & ~15; 4472 if (StartOffset) 4473 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 4474 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 4475 4476 int EltNo = (Offset - StartOffset) >> 2; 4477 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 4478 EVT VT = (PVT == MVT::i32) ? 
MVT::v4i32 : MVT::v4f32; 4479 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr, 4480 LD->getPointerInfo().getWithOffset(StartOffset), 4481 false, false, 0); 4482 // Canonicalize it to a v4i32 shuffle. 4483 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); 4484 return DAG.getNode(ISD::BITCAST, dl, VT, 4485 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 4486 DAG.getUNDEF(MVT::v4i32),&Mask[0])); 4487 } 4488 4489 return SDValue(); 4490} 4491 4492/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 4493/// vector of type 'VT', see if the elements can be replaced by a single large 4494/// load which has the same value as a build_vector whose operands are 'elts'. 4495/// 4496/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 4497/// 4498/// FIXME: we'd also like to handle the case where the last elements are zero 4499/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 4500/// There's even a handy isZeroNode for that purpose. 4501static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 4502 DebugLoc &DL, SelectionDAG &DAG) { 4503 EVT EltVT = VT.getVectorElementType(); 4504 unsigned NumElems = Elts.size(); 4505 4506 LoadSDNode *LDBase = NULL; 4507 unsigned LastLoadedElt = -1U; 4508 4509 // For each element in the initializer, see if we've found a load or an undef. 4510 // If we don't find an initial load element, or later load elements are 4511 // non-consecutive, bail out. 4512 for (unsigned i = 0; i < NumElems; ++i) { 4513 SDValue Elt = Elts[i]; 4514 4515 if (!Elt.getNode() || 4516 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 4517 return SDValue(); 4518 if (!LDBase) { 4519 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 4520 return SDValue(); 4521 LDBase = cast<LoadSDNode>(Elt.getNode()); 4522 LastLoadedElt = i; 4523 continue; 4524 } 4525 if (Elt.getOpcode() == ISD::UNDEF) 4526 continue; 4527 4528 LoadSDNode *LD = cast<LoadSDNode>(Elt); 4529 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 4530 return SDValue(); 4531 LastLoadedElt = i; 4532 } 4533 4534 // If we have found an entire vector of loads and undefs, then return a large 4535 // load of the entire vector width starting at the base pointer. If we found 4536 // consecutive loads for the low half, generate a vzext_load node. 
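  // For example, <load a[0], load a[1], load a[2], load a[3]> for v4i32
  // becomes a single 16-byte load of a, while <load a[0], load a[1], undef,
  // undef> becomes a VZEXT_LOAD of the low 64 bits.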
4537  if (LastLoadedElt == NumElems - 1) {
4538    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
4539      return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
4540                         LDBase->getPointerInfo(),
4541                         LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
4542    return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
4543                       LDBase->getPointerInfo(),
4544                       LDBase->isVolatile(), LDBase->isNonTemporal(),
4545                       LDBase->getAlignment());
4546  } else if (NumElems == 4 && LastLoadedElt == 1 &&
4547             DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
4548    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
4549    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
4550    SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys,
4551                                              Ops, 2, MVT::i32,
4552                                              LDBase->getMemOperand());
4553    return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
4554  }
4555  return SDValue();
4556}
4557
4558SDValue
4559X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
4560  DebugLoc dl = Op.getDebugLoc();
4561
4562  EVT VT = Op.getValueType();
4563  EVT ExtVT = VT.getVectorElementType();
4564  unsigned NumElems = Op.getNumOperands();
4565
4566  // All zeros:
4567  // - pxor (SSE2), xorps (SSE1), vpxor (128 AVX), xorp[s|d] (256 AVX)
4568  // All ones:
4569  // - pcmpeqd (SSE2 and 128 AVX), fall back to constant pools (256 AVX)
4570  if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
4571      ISD::isBuildVectorAllOnes(Op.getNode())) {
4572    // Canonicalize this to <4 x i32> or <8 x i32> (SSE) to
4573    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
4574    // eliminated on x86-32 hosts.
4575    if (Op.getValueType() == MVT::v4i32 ||
4576        Op.getValueType() == MVT::v8i32)
4577      return Op;
4578
4579    if (ISD::isBuildVectorAllOnes(Op.getNode()))
4580      return getOnesVector(Op.getValueType(), DAG, dl);
4581    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
4582  }
4583
4584  unsigned EVTBits = ExtVT.getSizeInBits();
4585
4586  unsigned NumZero = 0;
4587  unsigned NumNonZero = 0;
4588  unsigned NonZeros = 0;
4589  bool IsAllConstants = true;
4590  SmallSet<SDValue, 8> Values;
4591  for (unsigned i = 0; i < NumElems; ++i) {
4592    SDValue Elt = Op.getOperand(i);
4593    if (Elt.getOpcode() == ISD::UNDEF)
4594      continue;
4595    Values.insert(Elt);
4596    if (Elt.getOpcode() != ISD::Constant &&
4597        Elt.getOpcode() != ISD::ConstantFP)
4598      IsAllConstants = false;
4599    if (X86::isZeroNode(Elt))
4600      NumZero++;
4601    else {
4602      NonZeros |= (1 << i);
4603      NumNonZero++;
4604    }
4605  }
4606
4607  // All-undef vector. Return an UNDEF. All zero vectors were handled above.
4608  if (NumNonZero == 0)
4609    return DAG.getUNDEF(VT);
4610
4611  // Special case for a single non-zero, non-undef element.
4612  if (NumNonZero == 1) {
4613    unsigned Idx = CountTrailingZeros_32(NonZeros);
4614    SDValue Item = Op.getOperand(Idx);
4615
4616    // If this is an insertion of an i64 value on x86-32, and if the top bits of
4617    // the value are obviously zero, truncate the value to i32 and do the
4618    // insertion that way. Only do this if the value is non-constant or if the
4619    // value is a constant being inserted into element 0. It is cheaper to do
4620    // a constant pool load than it is to do a movd + shuffle.
4621    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
4622        (!IsAllConstants || Idx == 0)) {
4623      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
4624        // Handle SSE only.
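        // Concrete sketch of this path (an assumed example, not from the
        // original comments): on x86-32, inserting an i64 value X whose high
        // 32 bits are known zero into a v2i64 becomes: trunc X to i32, movd
        // it in via scalar_to_vector with a zeroing shuffle, then bitcast
        // the v4i32 result back to v2i64, with an extra swizzle below when
        // the insertion index is not 0.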
4625 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 4626 EVT VecVT = MVT::v4i32; 4627 unsigned VecElts = 4; 4628 4629 // Truncate the value (which may itself be a constant) to i32, and 4630 // convert it to a vector with movd (S2V+shuffle to zero extend). 4631 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 4632 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 4633 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4634 Subtarget->hasSSE2(), DAG); 4635 4636 // Now we have our 32-bit value zero extended in the low element of 4637 // a vector. If Idx != 0, swizzle it into place. 4638 if (Idx != 0) { 4639 SmallVector<int, 4> Mask; 4640 Mask.push_back(Idx); 4641 for (unsigned i = 1; i != VecElts; ++i) 4642 Mask.push_back(i); 4643 Item = DAG.getVectorShuffle(VecVT, dl, Item, 4644 DAG.getUNDEF(Item.getValueType()), 4645 &Mask[0]); 4646 } 4647 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); 4648 } 4649 } 4650 4651 // If we have a constant or non-constant insertion into the low element of 4652 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 4653 // the rest of the elements. This will be matched as movd/movq/movss/movsd 4654 // depending on what the source datatype is. 4655 if (Idx == 0) { 4656 if (NumZero == 0) { 4657 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4658 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 4659 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 4660 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4661 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 4662 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 4663 DAG); 4664 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 4665 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 4666 assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); 4667 EVT MiddleVT = MVT::v4i32; 4668 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 4669 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4670 Subtarget->hasSSE2(), DAG); 4671 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 4672 } 4673 } 4674 4675 // Is it a vector logical left shift? 4676 if (NumElems == 2 && Idx == 1 && 4677 X86::isZeroNode(Op.getOperand(0)) && 4678 !X86::isZeroNode(Op.getOperand(1))) { 4679 unsigned NumBits = VT.getSizeInBits(); 4680 return getVShift(true, VT, 4681 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4682 VT, Op.getOperand(1)), 4683 NumBits/2, DAG, *this, dl); 4684 } 4685 4686 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 4687 return SDValue(); 4688 4689 // Otherwise, if this is a vector with i32 or f32 elements, and the element 4690 // is a non-constant being inserted into an element other than the low one, 4691 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 4692 // movd/movss) to move this into the low element, then shuffle it into 4693 // place. 4694 if (EVTBits == 32) { 4695 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4696 4697 // Turn it into a shuffle of zero and zero-extended scalar to vector. 4698 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 4699 Subtarget->hasSSE2(), DAG); 4700 SmallVector<int, 8> MaskVec; 4701 for (unsigned i = 0; i < NumElems; i++) 4702 MaskVec.push_back(i == Idx ? 0 : 1); 4703 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 4704 } 4705 } 4706 4707 // Splat is obviously ok. Let legalizer expand it to a shuffle. 
4708  if (Values.size() == 1) {
4709    if (EVTBits == 32) {
4710      // Instead of a shuffle like this:
4711      //   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
4712      // check if it's possible to issue this instead:
4713      //   shuffle (vload ptr), undef, <1, 1, 1, 1>
4714      unsigned Idx = CountTrailingZeros_32(NonZeros);
4715      SDValue Item = Op.getOperand(Idx);
4716      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
4717        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
4718    }
4719    return SDValue();
4720  }
4721
4722  // A vector full of immediates; various special cases are already
4723  // handled, so this is best done with a single constant-pool load.
4724  if (IsAllConstants)
4725    return SDValue();
4726
4727  // For AVX-length vectors, build the individual 128-bit pieces and use
4728  // shuffles to put them in place.
4729  if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) {
4730    SmallVector<SDValue, 32> V;
4731    for (unsigned i = 0; i < NumElems; ++i)
4732      V.push_back(Op.getOperand(i));
4733
4734    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
4735
4736    // Build both the lower and upper subvector.
4737    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
4738    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
4739                                NumElems/2);
4740
4741    // Recreate the wider vector with the lower and upper parts.
4742    SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Upper,
4743                                     DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
4744    return Insert128BitVector(Vec, Lower, DAG.getConstant(0, MVT::i32),
4745                              DAG, dl);
4746  }
4747
4748  // Let the legalizer expand 2-wide build_vectors.
4749  if (EVTBits == 64) {
4750    if (NumNonZero == 1) {
4751      // One half is zero or undef.
4752      unsigned Idx = CountTrailingZeros_32(NonZeros);
4753      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
4754                               Op.getOperand(Idx));
4755      return getShuffleVectorZeroOrUndef(V2, Idx, true,
4756                                         Subtarget->hasSSE2(), DAG);
4757    }
4758    return SDValue();
4759  }
4760
4761  // If the element VT is < 32 bits, convert it to inserts into a zero vector.
4762  if (EVTBits == 8 && NumElems == 16) {
4763    SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG,
4764                                      *this);
4765    if (V.getNode()) return V;
4766  }
4767
4768  if (EVTBits == 16 && NumElems == 8) {
4769    SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG,
4770                                      *this);
4771    if (V.getNode()) return V;
4772  }
4773
4774  // If the element VT is == 32 bits, turn it into a number of shuffles.
4775  SmallVector<SDValue, 8> V;
4776  V.resize(NumElems);
4777  if (NumElems == 4 && NumZero > 0) {
4778    for (unsigned i = 0; i < 4; ++i) {
4779      bool isZero = !(NonZeros & (1 << i));
4780      if (isZero)
4781        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
4782      else
4783        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
4784    }
4785
4786    for (unsigned i = 0; i < 2; ++i) {
4787      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
4788        default: break;
4789        case 0:
4790          V[i] = V[i*2];  // Must be a zero vector.
4791          break;
4792        case 1:
4793          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
4794          break;
4795        case 2:
4796          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
4797          break;
4798        case 3:
4799          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
4800          break;
4801      }
4802    }
4803
4804    SmallVector<int, 8> MaskVec;
4805    bool Reverse = (NonZeros & 0x3) == 2;
4806    for (unsigned i = 0; i < 2; ++i)
4807      MaskVec.push_back(Reverse ? 1-i : i);
4808    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
4809    for (unsigned i = 0; i < 2; ++i)
4810      MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
4811    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
4812  }
4813
4814  if (Values.size() > 1 && VT.getSizeInBits() == 128) {
4815    // Check for a build vector of consecutive loads.
4816    for (unsigned i = 0; i < NumElems; ++i)
4817      V[i] = Op.getOperand(i);
4818
4819    // Check for elements which are consecutive loads.
4820    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
4821    if (LD.getNode())
4822      return LD;
4823
4824    // For SSE 4.1, use insertps to put each high element into place.
4825    if (getSubtarget()->hasSSE41()) {
4826      SDValue Result;
4827      if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
4828        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
4829      else
4830        Result = DAG.getUNDEF(VT);
4831
4832      for (unsigned i = 1; i < NumElems; ++i) {
4833        if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
4834        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
4835                             Op.getOperand(i), DAG.getIntPtrConstant(i));
4836      }
4837      return Result;
4838    }
4839
4840    // Otherwise, expand into a number of unpckl*; start by extending each of
4841    // our (non-undef) elements to the full vector width with the element in the
4842    // bottom slot of the vector (which generates no code for SSE).
4843    for (unsigned i = 0; i < NumElems; ++i) {
4844      if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
4845        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
4846      else
4847        V[i] = DAG.getUNDEF(VT);
4848    }
4849
4850    // Next, we iteratively mix elements, e.g. for v4f32:
4851    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
4852    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
4853    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
4854    unsigned EltStride = NumElems >> 1;
4855    while (EltStride != 0) {
4856      for (unsigned i = 0; i < EltStride; ++i) {
4857        // If V[i+EltStride] is undef and this is the first round of mixing,
4858        // then it is safe to just drop this shuffle: V[i] is already in the
4859        // right place, the one element (since it's the first round) being
4860        // inserted as undef can be dropped. This isn't safe for successive
4861        // rounds because they will permute elements within both vectors.
4862        if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
4863            EltStride == NumElems/2)
4864          continue;
4865
4866        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
4867      }
4868      EltStride >>= 1;
4869    }
4870    return V[0];
4871  }
4872  return SDValue();
4873}
4874
4875SDValue
4876X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
4877  // We support concatenating two MMX registers and placing the result in an
4878  // MMX register. This is better than going through the stack.
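  // A rough picture of the lowering below (illustrative, not from the
  // original comments): operand 0 is moved into the low quadword of an XMM
  // register with MOVQ2DQ; operand 1 is either inserted directly when it is
  // a scalar_to_vector, or moved the same way and combined with a v2i64
  // shuffle <0, 2> that packs both low quadwords into one register.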
4879  DebugLoc dl = Op.getDebugLoc();
4880  EVT ResVT = Op.getValueType();
4881  assert(Op.getNumOperands() == 2);
4882  assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
4883         ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
4884  int Mask[2];
4885  SDValue InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op.getOperand(0));
4886  SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
4887  InVec = Op.getOperand(1);
4888  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
4889    unsigned NumElts = ResVT.getVectorNumElements();
4890    VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
4891    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
4892                        InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
4893  } else {
4894    InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec);
4895    SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
4896    Mask[0] = 0; Mask[1] = 2;
4897    VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
4898  }
4899  return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
4900}
4901
4902// v8i16 shuffles - Prefer shuffles in the following order:
4903//  1. [all]   pshuflw, pshufhw, optional move
4904//  2. [ssse3] 1 x pshufb
4905//  3. [ssse3] 2 x pshufb + 1 x por
4906//  4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
4907SDValue
4908X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
4909                                            SelectionDAG &DAG) const {
4910  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
4911  SDValue V1 = SVOp->getOperand(0);
4912  SDValue V2 = SVOp->getOperand(1);
4913  DebugLoc dl = SVOp->getDebugLoc();
4914  SmallVector<int, 8> MaskVals;
4915
4916  // Determine if more than 1 of the words in each of the low and high quadwords
4917  // of the result come from the same quadword of one of the two inputs. Undef
4918  // mask values count as coming from any quadword, for better codegen.
4919  SmallVector<unsigned, 4> LoQuad(4);
4920  SmallVector<unsigned, 4> HiQuad(4);
4921  BitVector InputQuads(4);
4922  for (unsigned i = 0; i < 8; ++i) {
4923    SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
4924    int EltIdx = SVOp->getMaskElt(i);
4925    MaskVals.push_back(EltIdx);
4926    if (EltIdx < 0) {
4927      ++Quad[0];
4928      ++Quad[1];
4929      ++Quad[2];
4930      ++Quad[3];
4931      continue;
4932    }
4933    ++Quad[EltIdx / 4];
4934    InputQuads.set(EltIdx / 4);
4935  }
4936
4937  int BestLoQuad = -1;
4938  unsigned MaxQuad = 1;
4939  for (unsigned i = 0; i < 4; ++i) {
4940    if (LoQuad[i] > MaxQuad) {
4941      BestLoQuad = i;
4942      MaxQuad = LoQuad[i];
4943    }
4944  }
4945
4946  int BestHiQuad = -1;
4947  MaxQuad = 1;
4948  for (unsigned i = 0; i < 4; ++i) {
4949    if (HiQuad[i] > MaxQuad) {
4950      BestHiQuad = i;
4951      MaxQuad = HiQuad[i];
4952    }
4953  }
4954
4955  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
4956  // of the two input vectors, shuffle them into one input vector so only a
4957  // single pshufb instruction is necessary. If there are more than 2 input
4958  // quads, disable the next transformation since it does not help SSSE3.
4959  bool V1Used = InputQuads[0] || InputQuads[1];
4960  bool V2Used = InputQuads[2] || InputQuads[3];
4961  if (Subtarget->hasSSSE3()) {
4962    if (InputQuads.count() == 2 && V1Used && V2Used) {
4963      BestLoQuad = InputQuads.find_first();
4964      BestHiQuad = InputQuads.find_next(BestLoQuad);
4965    }
4966    if (InputQuads.count() > 2) {
4967      BestLoQuad = -1;
4968      BestHiQuad = -1;
4969    }
4970  }
4971
4972  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
4973  // the shuffle mask.
If a quad is scored as -1, that means that it contains 4974 // words from all 4 input quadwords. 4975 SDValue NewV; 4976 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4977 SmallVector<int, 8> MaskV; 4978 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4979 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4980 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4981 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 4982 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 4983 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 4984 4985 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4986 // source words for the shuffle, to aid later transformations. 4987 bool AllWordsInNewV = true; 4988 bool InOrder[2] = { true, true }; 4989 for (unsigned i = 0; i != 8; ++i) { 4990 int idx = MaskVals[i]; 4991 if (idx != (int)i) 4992 InOrder[i/4] = false; 4993 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4994 continue; 4995 AllWordsInNewV = false; 4996 break; 4997 } 4998 4999 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 5000 if (AllWordsInNewV) { 5001 for (int i = 0; i != 8; ++i) { 5002 int idx = MaskVals[i]; 5003 if (idx < 0) 5004 continue; 5005 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 5006 if ((idx != i) && idx < 4) 5007 pshufhw = false; 5008 if ((idx != i) && idx > 3) 5009 pshuflw = false; 5010 } 5011 V1 = NewV; 5012 V2Used = false; 5013 BestLoQuad = 0; 5014 BestHiQuad = 1; 5015 } 5016 5017 // If we've eliminated the use of V2, and the new mask is a pshuflw or 5018 // pshufhw, that's as cheap as it gets. Return the new shuffle. 5019 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 5020 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 5021 unsigned TargetMask = 0; 5022 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 5023 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 5024 TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): 5025 X86::getShufflePSHUFLWImmediate(NewV.getNode()); 5026 V1 = NewV.getOperand(0); 5027 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 5028 } 5029 } 5030 5031 // If we have SSSE3, and all words of the result are from 1 input vector, 5032 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 5033 // is present, fall back to case 4. 5034 if (Subtarget->hasSSSE3()) { 5035 SmallVector<SDValue,16> pshufbMask; 5036 5037 // If we have elements from both input vectors, set the high bit of the 5038 // shuffle mask element to zero out elements that come from V2 in the V1 5039 // mask, and elements that come from V1 in the V2 mask, so that the two 5040 // results can be OR'd together. 5041 bool TwoInputs = V1Used && V2Used; 5042 for (unsigned i = 0; i != 8; ++i) { 5043 int EltIdx = MaskVals[i] * 2; 5044 if (TwoInputs && (EltIdx >= 16)) { 5045 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5046 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5047 continue; 5048 } 5049 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5050 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 5051 } 5052 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); 5053 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5054 DAG.getNode(ISD::BUILD_VECTOR, dl, 5055 MVT::v16i8, &pshufbMask[0], 16)); 5056 if (!TwoInputs) 5057 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5058 5059 // Calculate the shuffle mask for the second input, shuffle it, and 5060 // OR it with the first shuffled input. 
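    // Byte-level example of the two pshufb masks (illustrative values): for
    // a word mask beginning <0, 9, 2, 11, ...>, the V1 mask built above
    // begins 0,1, 0x80,0x80, 4,5, 0x80,0x80 (the bytes of V1's words, with
    // zeros where the word comes from V2), while the V2 mask built below
    // begins 0x80,0x80, 2,3, 0x80,0x80, 6,7 (EltIdx - 16 scaled to bytes).
    // The OR then merges the two partial results.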
5061 pshufbMask.clear(); 5062 for (unsigned i = 0; i != 8; ++i) { 5063 int EltIdx = MaskVals[i] * 2; 5064 if (EltIdx < 16) { 5065 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5066 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5067 continue; 5068 } 5069 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 5070 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 5071 } 5072 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); 5073 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5074 DAG.getNode(ISD::BUILD_VECTOR, dl, 5075 MVT::v16i8, &pshufbMask[0], 16)); 5076 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5077 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5078 } 5079 5080 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 5081 // and update MaskVals with new element order. 5082 BitVector InOrder(8); 5083 if (BestLoQuad >= 0) { 5084 SmallVector<int, 8> MaskV; 5085 for (int i = 0; i != 4; ++i) { 5086 int idx = MaskVals[i]; 5087 if (idx < 0) { 5088 MaskV.push_back(-1); 5089 InOrder.set(i); 5090 } else if ((idx / 4) == BestLoQuad) { 5091 MaskV.push_back(idx & 3); 5092 InOrder.set(i); 5093 } else { 5094 MaskV.push_back(-1); 5095 } 5096 } 5097 for (unsigned i = 4; i != 8; ++i) 5098 MaskV.push_back(i); 5099 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5100 &MaskV[0]); 5101 5102 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 5103 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 5104 NewV.getOperand(0), 5105 X86::getShufflePSHUFLWImmediate(NewV.getNode()), 5106 DAG); 5107 } 5108 5109 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 5110 // and update MaskVals with the new element order. 5111 if (BestHiQuad >= 0) { 5112 SmallVector<int, 8> MaskV; 5113 for (unsigned i = 0; i != 4; ++i) 5114 MaskV.push_back(i); 5115 for (unsigned i = 4; i != 8; ++i) { 5116 int idx = MaskVals[i]; 5117 if (idx < 0) { 5118 MaskV.push_back(-1); 5119 InOrder.set(i); 5120 } else if ((idx / 4) == BestHiQuad) { 5121 MaskV.push_back((idx & 3) + 4); 5122 InOrder.set(i); 5123 } else { 5124 MaskV.push_back(-1); 5125 } 5126 } 5127 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5128 &MaskV[0]); 5129 5130 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 5131 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 5132 NewV.getOperand(0), 5133 X86::getShufflePSHUFHWImmediate(NewV.getNode()), 5134 DAG); 5135 } 5136 5137 // In case BestHi & BestLo were both -1, which means each quadword has a word 5138 // from each of the four input quadwords, calculate the InOrder bitvector now 5139 // before falling through to the insert/extract cleanup. 5140 if (BestLoQuad == -1 && BestHiQuad == -1) { 5141 NewV = V1; 5142 for (int i = 0; i != 8; ++i) 5143 if (MaskVals[i] < 0 || MaskVals[i] == i) 5144 InOrder.set(i); 5145 } 5146 5147 // The other elements are put in the right place using pextrw and pinsrw. 5148 for (unsigned i = 0; i != 8; ++i) { 5149 if (InOrder[i]) 5150 continue; 5151 int EltIdx = MaskVals[i]; 5152 if (EltIdx < 0) 5153 continue; 5154 SDValue ExtOp = (EltIdx < 8) 5155 ? 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
5156                  DAG.getIntPtrConstant(EltIdx))
5157      : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
5158                    DAG.getIntPtrConstant(EltIdx - 8));
5159    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
5160                       DAG.getIntPtrConstant(i));
5161  }
5162  return NewV;
5163}
5164
5165// v16i8 shuffles - Prefer shuffles in the following order:
5166//  1. [ssse3] 1 x pshufb
5167//  2. [ssse3] 2 x pshufb + 1 x por
5168//  3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
5169static
5170SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
5171                                 SelectionDAG &DAG,
5172                                 const X86TargetLowering &TLI) {
5173  SDValue V1 = SVOp->getOperand(0);
5174  SDValue V2 = SVOp->getOperand(1);
5175  DebugLoc dl = SVOp->getDebugLoc();
5176  SmallVector<int, 16> MaskVals;
5177  SVOp->getMask(MaskVals);
5178
5179  // If we have SSSE3, case 1 is generated when all result bytes come from
5180  // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
5181  // present, fall back to case 3.
5182  // FIXME: kill V2Only once shuffles are canonicalized by getNode.
5183  bool V1Only = true;
5184  bool V2Only = true;
5185  for (unsigned i = 0; i < 16; ++i) {
5186    int EltIdx = MaskVals[i];
5187    if (EltIdx < 0)
5188      continue;
5189    if (EltIdx < 16)
5190      V2Only = false;
5191    else
5192      V1Only = false;
5193  }
5194
5195  // If we have SSSE3, use 1 pshufb per input vector that has elements in the
5196  // result.
5196  if (TLI.getSubtarget()->hasSSSE3()) {
5197    SmallVector<SDValue,16> pshufbMask;
5198
5199    // If all result elements are from one input vector, then only translate
5200    // undef mask values to 0x80 (zero out result) in the pshufb mask.
5201    //
5202    // Otherwise, we have elements from both input vectors, and must zero out
5203    // elements that come from V2 in the first mask, and V1 in the second mask
5204    // so that we can OR them together.
5205    bool TwoInputs = !(V1Only || V2Only);
5206    for (unsigned i = 0; i != 16; ++i) {
5207      int EltIdx = MaskVals[i];
5208      if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
5209        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
5210        continue;
5211      }
5212      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
5213    }
5214    // If all the elements are from V2, assign it to V1 and return after
5215    // building the first pshufb.
5216    if (V2Only)
5217      V1 = V2;
5218    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
5219                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5220                                 MVT::v16i8, &pshufbMask[0], 16));
5221    if (!TwoInputs)
5222      return V1;
5223
5224    // Calculate the shuffle mask for the second input, shuffle it, and
5225    // OR it with the first shuffled input.
5226    pshufbMask.clear();
5227    for (unsigned i = 0; i != 16; ++i) {
5228      int EltIdx = MaskVals[i];
5229      if (EltIdx < 16) {
5230        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
5231        continue;
5232      }
5233      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
5234    }
5235    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
5236                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5237                                 MVT::v16i8, &pshufbMask[0], 16));
5238    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
5239  }
5240
5241  // No SSSE3 - Calculate the in-place words, then fix all out-of-place words
5242  // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
5243  // the 16 different words that comprise the two doublequadword input vectors.
5244  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
5245  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
5246  SDValue NewV = V2Only ? V2 : V1;
5247  for (int i = 0; i != 8; ++i) {
5248    int Elt0 = MaskVals[i*2];
5249    int Elt1 = MaskVals[i*2+1];
5250
5251    // This word of the result is all undef, skip it.
5252    if (Elt0 < 0 && Elt1 < 0)
5253      continue;
5254
5255    // This word of the result is already in the correct place, skip it.
5256    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
5257      continue;
5258    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
5259      continue;
5260
5261    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
5262    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
5263    SDValue InsElt;
5264
5265    // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
5266    // together using a single extract, load it and store it.
5267    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
5268      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
5269                           DAG.getIntPtrConstant(Elt1 / 2));
5270      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
5271                         DAG.getIntPtrConstant(i));
5272      continue;
5273    }
5274
5275    // If Elt1 is defined, extract it from the appropriate source. If the
5276    // source byte is not also odd, shift the extracted word left 8 bits;
5277    // otherwise clear the bottom 8 bits if we need to do an or.
5278    if (Elt1 >= 0) {
5279      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
5280                           DAG.getIntPtrConstant(Elt1 / 2));
5281      if ((Elt1 & 1) == 0)
5282        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
5283                             DAG.getConstant(8,
5284                                 TLI.getShiftAmountTy(InsElt.getValueType())));
5285      else if (Elt0 >= 0)
5286        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
5287                             DAG.getConstant(0xFF00, MVT::i16));
5288    }
5289    // If Elt0 is defined, extract it from the appropriate source. If the
5290    // source byte is not also even, shift the extracted word right 8 bits. If
5291    // Elt1 was also defined, OR the extracted values together before
5292    // inserting them in the result.
5293    if (Elt0 >= 0) {
5294      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
5295                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
5296      if ((Elt0 & 1) != 0)
5297        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
5298                              DAG.getConstant(8,
5299                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
5300      else if (Elt1 >= 0)
5301        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
5302                              DAG.getConstant(0x00FF, MVT::i16));
5303      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
5304                         : InsElt0;
5305    }
5306    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
5307                       DAG.getIntPtrConstant(i));
5308  }
5309  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
5310}
5311
5312/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4-wide
5313/// ones, or rewriting v4i32 / v4f32 as 2-wide ones if possible. This can be
5314/// done when every pair / quad of shuffle mask elements points to elements in
5315/// the right sequence, e.g.
5316/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
5317static
5318SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
5319                                 SelectionDAG &DAG, DebugLoc dl) {
5320  EVT VT = SVOp->getValueType(0);
5321  SDValue V1 = SVOp->getOperand(0);
5322  SDValue V2 = SVOp->getOperand(1);
5323  unsigned NumElems = VT.getVectorNumElements();
5324  unsigned NewWidth = (NumElems == 4) ?
2 : 4;
5325  EVT NewVT;
5326  switch (VT.getSimpleVT().SimpleTy) {
5327  default: assert(false && "Unexpected!");
5328  case MVT::v4f32: NewVT = MVT::v2f64; break;
5329  case MVT::v4i32: NewVT = MVT::v2i64; break;
5330  case MVT::v8i16: NewVT = MVT::v4i32; break;
5331  case MVT::v16i8: NewVT = MVT::v4i32; break;
5332  }
5333
5334  int Scale = NumElems / NewWidth;
5335  SmallVector<int, 8> MaskVec;
5336  for (unsigned i = 0; i < NumElems; i += Scale) {
5337    int StartIdx = -1;
5338    for (int j = 0; j < Scale; ++j) {
5339      int EltIdx = SVOp->getMaskElt(i+j);
5340      if (EltIdx < 0)
5341        continue;
5342      if (StartIdx == -1)
5343        StartIdx = EltIdx - (EltIdx % Scale);
5344      if (EltIdx != StartIdx + j)
5345        return SDValue();
5346    }
5347    if (StartIdx == -1)
5348      MaskVec.push_back(-1);
5349    else
5350      MaskVec.push_back(StartIdx / Scale);
5351  }
5352
5353  V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
5354  V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
5355  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
5356}
5357
5358/// getVZextMovL - Return a zero-extending vector move low node.
5359///
5360static SDValue getVZextMovL(EVT VT, EVT OpVT,
5361                            SDValue SrcOp, SelectionDAG &DAG,
5362                            const X86Subtarget *Subtarget, DebugLoc dl) {
5363  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
5364    LoadSDNode *LD = NULL;
5365    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
5366      LD = dyn_cast<LoadSDNode>(SrcOp);
5367    if (!LD) {
5368      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
5369      // instead.
5370      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
5371      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
5372          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5373          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
5374          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
5375        // PR2108
5376        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
5377        return DAG.getNode(ISD::BITCAST, dl, VT,
5378                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
5379                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5380                                                   OpVT,
5381                                                   SrcOp.getOperand(0)
5382                                                        .getOperand(0))));
5383      }
5384    }
5385  }
5386
5387  return DAG.getNode(ISD::BITCAST, dl, VT,
5388                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
5389                                 DAG.getNode(ISD::BITCAST, dl,
5390                                             OpVT, SrcOp)));
5391}
5392
5393/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles that
5394/// could not be matched by any known target-specific shuffle.
5395static SDValue
5396LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
5397  return SDValue();
5398}
5399
5400/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
5401/// 4 elements, and match them with several different shuffle types.
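/// Broadly, as implemented below: if at most two elements come from each
/// input, two generic shuffles suffice; with a 3-vs-1 split, an intermediate
/// shufps-style shuffle is built first; otherwise the mask is decomposed
/// into (shuffle shuffle_hi, shuffle_lo).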
5402static SDValue
5403LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
5404  SDValue V1 = SVOp->getOperand(0);
5405  SDValue V2 = SVOp->getOperand(1);
5406  DebugLoc dl = SVOp->getDebugLoc();
5407  EVT VT = SVOp->getValueType(0);
5408
5409  assert(VT.getSizeInBits() == 128 && "Unsupported vector size");
5410
5411  SmallVector<std::pair<int, int>, 8> Locs;
5412  Locs.resize(4);
5413  SmallVector<int, 8> Mask1(4U, -1);
5414  SmallVector<int, 8> PermMask;
5415  SVOp->getMask(PermMask);
5416
5417  unsigned NumHi = 0;
5418  unsigned NumLo = 0;
5419  for (unsigned i = 0; i != 4; ++i) {
5420    int Idx = PermMask[i];
5421    if (Idx < 0) {
5422      Locs[i] = std::make_pair(-1, -1);
5423    } else {
5424      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
5425      if (Idx < 4) {
5426        Locs[i] = std::make_pair(0, NumLo);
5427        Mask1[NumLo] = Idx;
5428        NumLo++;
5429      } else {
5430        Locs[i] = std::make_pair(1, NumHi);
5431        if (2+NumHi < 4)
5432          Mask1[2+NumHi] = Idx;
5433        NumHi++;
5434      }
5435    }
5436  }
5437
5438  if (NumLo <= 2 && NumHi <= 2) {
5439    // No more than two elements come from either vector; this can be
5440    // implemented with two shuffles. The first shuffle gathers the elements.
5441    // The second shuffle, which takes the first shuffle as both of its
5442    // vector operands, puts the elements into the right order.
5443    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
5444
5445    SmallVector<int, 8> Mask2(4U, -1);
5446
5447    for (unsigned i = 0; i != 4; ++i) {
5448      if (Locs[i].first == -1)
5449        continue;
5450      else {
5451        unsigned Idx = (i < 2) ? 0 : 4;
5452        Idx += Locs[i].first * 2 + Locs[i].second;
5453        Mask2[i] = Idx;
5454      }
5455    }
5456
5457    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
5458  } else if (NumLo == 3 || NumHi == 3) {
5459    // Otherwise, we must have three elements from one vector, call it X, and
5460    // one element from the other, call it Y. First, use a shufps to build an
5461    // intermediate vector with the one element from Y and the element from X
5462    // that will be in the same half in the final destination (the indexes
5463    // don't matter). Then, use a shufps to build the final vector, taking the
5464    // half containing the element from Y from the intermediate, and the other
5465    // half from X.
5466    if (NumHi == 3) {
5467      // Normalize it so the 3 elements come from V1.
5468      CommuteVectorShuffleMask(PermMask, VT);
5469      std::swap(V1, V2);
5470    }
5471
5472    // Find the element from V2.
5473    unsigned HiIndex;
5474    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
5475      int Val = PermMask[HiIndex];
5476      if (Val < 0)
5477        continue;
5478      if (Val >= 4)
5479        break;
5480    }
5481
5482    Mask1[0] = PermMask[HiIndex];
5483    Mask1[1] = -1;
5484    Mask1[2] = PermMask[HiIndex^1];
5485    Mask1[3] = -1;
5486    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
5487
5488    if (HiIndex >= 2) {
5489      Mask1[0] = PermMask[0];
5490      Mask1[1] = PermMask[1];
5491      Mask1[2] = HiIndex & 1 ? 6 : 4;
5492      Mask1[3] = HiIndex & 1 ? 4 : 6;
5493      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
5494    } else {
5495      Mask1[0] = HiIndex & 1 ? 2 : 0;
5496      Mask1[1] = HiIndex & 1 ? 0 : 2;
5497      Mask1[2] = PermMask[2];
5498      Mask1[3] = PermMask[3];
5499      if (Mask1[2] >= 0)
5500        Mask1[2] += 4;
5501      if (Mask1[3] >= 0)
5502        Mask1[3] += 4;
5503      return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
5504    }
5505  }
5506
5507  // Break it into (shuffle shuffle_hi, shuffle_lo).
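  // Worked example of the decomposition below (the mask is illustrative):
  // for mask <0, 4, 5, 2>, LoMask becomes <0, u, 4, u> and HiMask becomes
  // <2, u, 5, u>; the final shuffle then combines LoShuffle and HiShuffle
  // with mask <0, 2, 6, 4> to place every element.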
5508  Locs.clear();
5509  Locs.resize(4);
5510  SmallVector<int,8> LoMask(4U, -1);
5511  SmallVector<int,8> HiMask(4U, -1);
5512
5513  SmallVector<int,8> *MaskPtr = &LoMask;
5514  unsigned MaskIdx = 0;
5515  unsigned LoIdx = 0;
5516  unsigned HiIdx = 2;
5517  for (unsigned i = 0; i != 4; ++i) {
5518    if (i == 2) {
5519      MaskPtr = &HiMask;
5520      MaskIdx = 1;
5521      LoIdx = 0;
5522      HiIdx = 2;
5523    }
5524    int Idx = PermMask[i];
5525    if (Idx < 0) {
5526      Locs[i] = std::make_pair(-1, -1);
5527    } else if (Idx < 4) {
5528      Locs[i] = std::make_pair(MaskIdx, LoIdx);
5529      (*MaskPtr)[LoIdx] = Idx;
5530      LoIdx++;
5531    } else {
5532      Locs[i] = std::make_pair(MaskIdx, HiIdx);
5533      (*MaskPtr)[HiIdx] = Idx;
5534      HiIdx++;
5535    }
5536  }
5537
5538  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
5539  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
5540  SmallVector<int, 8> MaskOps;
5541  for (unsigned i = 0; i != 4; ++i) {
5542    if (Locs[i].first == -1) {
5543      MaskOps.push_back(-1);
5544    } else {
5545      unsigned Idx = Locs[i].first * 4 + Locs[i].second;
5546      MaskOps.push_back(Idx);
5547    }
5548  }
5549  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
5550}
5551
5552static bool MayFoldVectorLoad(SDValue V) {
5553  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
5554    V = V.getOperand(0);
5555  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5556    V = V.getOperand(0);
5557  if (MayFoldLoad(V))
5558    return true;
5559  return false;
5560}
5561
5562// FIXME: the version above should always be used. Since there's
5563// a bug where several vector shuffles can't be folded because the
5564// DAG is not updated during lowering and a node claims to have two
5565// uses while it only has one, use this version, and let isel match
5566// another instruction if the load really happens to have more than
5567// one use. Remove this version after this bug gets fixed.
5568// rdar://8434668, PR8156
5569static bool RelaxedMayFoldVectorLoad(SDValue V) {
5570  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
5571    V = V.getOperand(0);
5572  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5573    V = V.getOperand(0);
5574  if (ISD::isNormalLoad(V.getNode()))
5575    return true;
5576  return false;
5577}
5578
5579/// CanXFormVExtractWithShuffleIntoLoad - Check if the current shuffle is used
5580/// by a vector extract, and if both can be later optimized into a single load.
5581/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked
5582/// here because otherwise a target specific shuffle node would be
5583/// emitted for this shuffle, and the optimization would not be done.
5584/// FIXME: This is probably not the best approach, but it fixes the problem
5585/// until the right path is decided.
5586static
5587bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG,
5588                                         const TargetLowering &TLI) {
5589  EVT VT = V.getValueType();
5590  ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V);
5591
5592  // Be sure that the vector shuffle is present in a pattern like this:
5593  // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr)
5594  if (!V.hasOneUse())
5595    return false;
5596
5597  SDNode *N = *V.getNode()->use_begin();
5598  if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5599    return false;
5600
5601  SDValue EltNo = N->getOperand(1);
5602  if (!isa<ConstantSDNode>(EltNo))
5603    return false;
5604
5605  // If the bit convert changed the number of elements, it is unsafe
5606  // to examine the mask.
5607  bool HasShuffleIntoBitcast = false;
5608  if (V.getOpcode() == ISD::BITCAST) {
5609    EVT SrcVT = V.getOperand(0).getValueType();
5610    if (SrcVT.getVectorNumElements() != VT.getVectorNumElements())
5611      return false;
5612    V = V.getOperand(0);
5613    HasShuffleIntoBitcast = true;
5614  }
5615
5616  // Select the input vector, guarding against an out-of-range extract index.
5617  unsigned NumElems = VT.getVectorNumElements();
5618  unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
5619  int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt);
5620  V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1);
5621
5622  // Skip one more bit_convert if necessary.
5623  if (V.getOpcode() == ISD::BITCAST)
5624    V = V.getOperand(0);
5625
5626  if (ISD::isNormalLoad(V.getNode())) {
5627    // Is the original load suitable?
5628    LoadSDNode *LN0 = cast<LoadSDNode>(V);
5629
5630    // FIXME: avoid the multi-use bug that is preventing lots of
5631    // foldings from being detected; this is still wrong of course, but
5632    // it gives the desired behavior temporarily. If the load really
5633    // does have more uses, it will not fold during isel and will
5634    // generate poor code.
5635    if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse()
5636      return false;
5637
5638    if (!HasShuffleIntoBitcast)
5639      return true;
5640
5641    // If there's a bitcast before the shuffle, check if the load type and
5642    // alignment are valid.
5643    unsigned Align = LN0->getAlignment();
5644    unsigned NewAlign =
5645      TLI.getTargetData()->getABITypeAlignment(
5646                                    VT.getTypeForEVT(*DAG.getContext()));
5647
5648    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
5649      return false;
5650  }
5651
5652  return true;
5653}
5654
5655static
5656SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
5657  EVT VT = Op.getValueType();
5658
5659  // Canonicalize to v2f64.
5660  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
5661  return DAG.getNode(ISD::BITCAST, dl, VT,
5662                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
5663                                          V1, DAG));
5664}
5665
5666static
5667SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
5668                        bool HasSSE2) {
5669  SDValue V1 = Op.getOperand(0);
5670  SDValue V2 = Op.getOperand(1);
5671  EVT VT = Op.getValueType();
5672
5673  assert(VT != MVT::v2i64 && "unsupported shuffle type");
5674
5675  if (HasSSE2 && VT == MVT::v2f64)
5676    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
5677
5678  // v4f32 or v4i32
5679  return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG);
5680}
5681
5682static
5683SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
5684  SDValue V1 = Op.getOperand(0);
5685  SDValue V2 = Op.getOperand(1);
5686  EVT VT = Op.getValueType();
5687
5688  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
5689         "unsupported shuffle type");
5690
5691  if (V2.getOpcode() == ISD::UNDEF)
5692    V2 = V1;
5693
5694  // v4i32 or v4f32
5695  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
5696}
5697
5698static
5699SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
5700  SDValue V1 = Op.getOperand(0);
5701  SDValue V2 = Op.getOperand(1);
5702  EVT VT = Op.getValueType();
5703  unsigned NumElems = VT.getVectorNumElements();
5704
5705  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
5706  // operand of these instructions is only memory, so check if there's a
5707  // potential load folding opportunity here; otherwise use SHUFPS or MOVSD to
5708  // match the same masks.
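  // Put differently (a summary, not from the original comment): MOVLPS and
  // MOVLPD only help when exactly one operand can become the memory operand,
  // which is why the code below gives up on load folding when both V1 and V2
  // look foldable.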
5709 bool CanFoldLoad = false; 5710 5711 // Trivial case, when V2 comes from a load. 5712 if (MayFoldVectorLoad(V2)) 5713 CanFoldLoad = true; 5714 5715 // When V1 is a load, it can be folded later into a store in isel, example: 5716 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) 5717 // turns into: 5718 // (MOVLPSmr addr:$src1, VR128:$src2) 5719 // So, recognize this potential and also use MOVLPS or MOVLPD 5720 if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) 5721 CanFoldLoad = true; 5722 5723 // Both of them can't be memory operations though. 5724 if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2)) 5725 CanFoldLoad = false; 5726 5727 if (CanFoldLoad) { 5728 if (HasSSE2 && NumElems == 2) 5729 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); 5730 5731 if (NumElems == 4) 5732 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); 5733 } 5734 5735 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5736 // movl and movlp will both match v2i64, but v2i64 is never matched by 5737 // movl earlier because we make it strict to avoid messing with the movlp load 5738 // folding logic (see the code above getMOVLP call). Match it here then, 5739 // this is horrible, but will stay like this until we move all shuffle 5740 // matching to x86 specific nodes. Note that for the 1st condition all 5741 // types are matched with movsd. 5742 if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp)) 5743 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 5744 else if (HasSSE2) 5745 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 5746 5747 5748 assert(VT != MVT::v4i32 && "unsupported shuffle type"); 5749 5750 // Invert the operand order and use SHUFPS to match it. 5751 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1, 5752 X86::getShuffleSHUFImmediate(SVOp), DAG); 5753} 5754 5755static inline unsigned getUNPCKLOpcode(EVT VT) { 5756 switch(VT.getSimpleVT().SimpleTy) { 5757 case MVT::v4i32: return X86ISD::PUNPCKLDQ; 5758 case MVT::v2i64: return X86ISD::PUNPCKLQDQ; 5759 case MVT::v4f32: return X86ISD::UNPCKLPS; 5760 case MVT::v2f64: return X86ISD::UNPCKLPD; 5761 case MVT::v8f32: return X86ISD::VUNPCKLPSY; 5762 case MVT::v4f64: return X86ISD::VUNPCKLPDY; 5763 case MVT::v16i8: return X86ISD::PUNPCKLBW; 5764 case MVT::v8i16: return X86ISD::PUNPCKLWD; 5765 default: 5766 llvm_unreachable("Unknown type for unpckl"); 5767 } 5768 return 0; 5769} 5770 5771static inline unsigned getUNPCKHOpcode(EVT VT) { 5772 switch(VT.getSimpleVT().SimpleTy) { 5773 case MVT::v4i32: return X86ISD::PUNPCKHDQ; 5774 case MVT::v2i64: return X86ISD::PUNPCKHQDQ; 5775 case MVT::v4f32: return X86ISD::UNPCKHPS; 5776 case MVT::v2f64: return X86ISD::UNPCKHPD; 5777 case MVT::v8f32: return X86ISD::VUNPCKHPSY; 5778 case MVT::v4f64: return X86ISD::VUNPCKHPDY; 5779 case MVT::v16i8: return X86ISD::PUNPCKHBW; 5780 case MVT::v8i16: return X86ISD::PUNPCKHWD; 5781 default: 5782 llvm_unreachable("Unknown type for unpckh"); 5783 } 5784 return 0; 5785} 5786 5787static 5788SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, 5789 const TargetLowering &TLI, 5790 const X86Subtarget *Subtarget) { 5791 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5792 EVT VT = Op.getValueType(); 5793 DebugLoc dl = Op.getDebugLoc(); 5794 SDValue V1 = Op.getOperand(0); 5795 SDValue V2 = Op.getOperand(1); 5796 5797 if (isZeroShuffle(SVOp)) 5798 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 5799 5800 // Handle splat operations 5801 if (SVOp->isSplat()) { 
5802    unsigned NumElem = VT.getVectorNumElements();
5803    // Special case: this is the only place now where it's allowed to return
5804    // a vector_shuffle operation without using a target specific node, because
5805    // *hopefully* it will be optimized away by the dag combiner. FIXME: should
5806    // this be moved to DAGCombine instead?
5807    if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
5808      return Op;
5809
5810    // Since there's no native support for scalar_to_vector for 256-bit AVX, a
5811    // 128-bit scalar_to_vector + INSERT_SUBVECTOR is generated. Recognize this
5812    // idiom and do the shuffle before the insertion; this yields fewer
5813    // instructions in the end.
5814    if (VT.is256BitVector() &&
5815        V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
5816        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
5817        V1.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR)
5818      return PromoteVectorToScalarSplat(SVOp, DAG);
5819
5820    // Handle splats by matching through known shuffle masks.
5821    if ((VT.is128BitVector() && NumElem <= 4) ||
5822        (VT.is256BitVector() && NumElem <= 8))
5823      return SDValue();
5824
5825    // i16 and i8 vector types can't be used directly by a generic shuffle
5826    // instruction because the target has no such instruction. Generate shuffles
5827    // which repeat i16 and i8 several times until they fit in i32, and then can
5828    // be manipulated by target supported shuffles. After the insertion of the
5829    // necessary shuffles, the result is bitcasted back to v4f32 or v8f32.
5830    return PromoteSplat(SVOp, DAG);
5831  }
5832
5833  // If the shuffle can be profitably rewritten as a narrower shuffle, then
5834  // do it!
5835  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
5836    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
5837    if (NewOp.getNode())
5838      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
5839  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
5840    // FIXME: Figure out a cleaner way to do this.
5841    // Try to make use of movq to zero out the top part.
5842    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
5843      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
5844      if (NewOp.getNode()) {
5845        if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
5846          return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
5847                              DAG, Subtarget, dl);
5848      }
5849    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
5850      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
5851      if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
5852        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
5853                            DAG, Subtarget, dl);
5854    }
5855  }
5856  return SDValue();
5857}
5858
5859SDValue
5860X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
5861  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5862  SDValue V1 = Op.getOperand(0);
5863  SDValue V2 = Op.getOperand(1);
5864  EVT VT = Op.getValueType();
5865  DebugLoc dl = Op.getDebugLoc();
5866  unsigned NumElems = VT.getVectorNumElements();
5867  bool isMMX = VT.getSizeInBits() == 64;
5868  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
5869  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
5870  bool V1IsSplat = false;
5871  bool V2IsSplat = false;
5872  bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX();
5873  bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX();
5874  bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX();
5875  MachineFunction &MF = DAG.getMachineFunction();
5876  bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
5877
5878  // Shuffle operations on MMX are not supported.
5879  if (isMMX)
5880    return Op;
5881
5882  // Vector shuffle lowering takes 3 steps:
5883  //
5884  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
5885  //    narrowing and commutation of operands should be handled.
5886  // 2) Matching of shuffles with known shuffle masks to x86 target specific
5887  //    shuffle nodes.
5888  // 3) Rewriting of unmatched masks into new generic shuffle operations,
5889  //    so the shuffle can be broken into other shuffles and the legalizer can
5890  //    try the lowering again.
5891  //
5892  // The general idea is that no vector_shuffle operation should be left to
5893  // be matched during isel; all of them must be converted to a target specific
5894  // node here.
5895
5896  // Normalize the input vectors. Here splats, zeroed vectors, profitable
5897  // narrowing and commutation of operands should be handled. The actual code
5898  // doesn't include all of those, work in progress...
5899  SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget);
5900  if (NewOp.getNode())
5901    return NewOp;
5902
5903  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
5904  // unpckh_undef). Only use pshufd if speed is more important than size.
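  // A size rationale (an assumption, not stated in the source): the unpck
  // forms are tried first under OptForSize because they need no immediate
  // byte, while pshufd always carries one, so the unpck encoding is shorter
  // even though isPSHUFDMask could cover these masks as well.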
5905  if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp))
5906    return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
5907  if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp))
5908    return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
5909
5910  if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef &&
5911      RelaxedMayFoldVectorLoad(V1))
5912    return getMOVDDup(Op, dl, V1, DAG);
5913
5914  if (X86::isMOVHLPS_v_undef_Mask(SVOp))
5915    return getMOVHighToLow(Op, dl, DAG);
5916
5917  // Used to match splats.
5918  if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef &&
5919      (VT == MVT::v2f64 || VT == MVT::v2i64))
5920    return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
5921
5922  if (X86::isPSHUFDMask(SVOp)) {
5923    // The actual implementation will match the mask in the if above and then
5924    // during isel it can match several different instructions, not only pshufd
5925    // as its name suggests; sad but true. Emulate the behavior for now...
5926    if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
5927      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
5928
5929    unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
5930
5931    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
5932      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
5933
5934    if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
5935      return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1,
5936                                  TargetMask, DAG);
5937
5938    if (VT == MVT::v4f32)
5939      return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1,
5940                                  TargetMask, DAG);
5941  }
5942
5943  // Check if this can be converted into a logical shift.
5944  bool isLeft = false;
5945  unsigned ShAmt = 0;
5946  SDValue ShVal;
5947  bool isShift = getSubtarget()->hasSSE2() &&
5948    isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
5949  if (isShift && ShVal.hasOneUse()) {
5950    // If the shifted value has multiple uses, it may be cheaper to use
5951    // v_set0 + movlhps or movhlps, etc.
5952    EVT EltVT = VT.getVectorElementType();
5953    ShAmt *= EltVT.getSizeInBits();
5954    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
5955  }
5956
5957  if (X86::isMOVLMask(SVOp)) {
5958    if (V1IsUndef)
5959      return V2;
5960    if (ISD::isBuildVectorAllZeros(V1.getNode()))
5961      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
5962    if (!X86::isMOVLPMask(SVOp)) {
5963      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
5964        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
5965
5966      if (VT == MVT::v4i32 || VT == MVT::v4f32)
5967        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
5968    }
5969  }
5970
5971  // FIXME: fold these into legal mask.
5972  if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp))
5973    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
5974
5975  if (X86::isMOVHLPSMask(SVOp))
5976    return getMOVHighToLow(Op, dl, DAG);
5977
5978  if (X86::isMOVSHDUPMask(SVOp, Subtarget))
5979    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
5980
5981  if (X86::isMOVSLDUPMask(SVOp, Subtarget))
5982    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
5983
5984  if (X86::isMOVLPMask(SVOp))
5985    return getMOVLP(Op, dl, DAG, HasSSE2);
5986
5987  if (ShouldXformToMOVHLPS(SVOp) ||
5988      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
5989    return CommuteVectorShuffle(SVOp, DAG);
5990
5991  if (isShift) {
5992    // No better options. Use a vshl / vsrl.
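    // For instance (illustrative, assuming isVectorShift reports the element
    // count): a v4i32 shuffle that moves the elements of ShVal up by one lane
    // with zeros shifted in has ShAmt == 1 here, which becomes a logical
    // shift of the whole register: getVShift bitcasts to v2i64 and emits
    // X86ISD::VSHL/VSRL with ShAmt * 32 bits.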
5993    EVT EltVT = VT.getVectorElementType();
5994    ShAmt *= EltVT.getSizeInBits();
5995    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
5996  }
5997
5998  bool Commuted = false;
5999  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
6000  // 1,1,1,1 -> v8i16 though.
6001  V1IsSplat = isSplatVector(V1.getNode());
6002  V2IsSplat = isSplatVector(V2.getNode());
6003
6004  // Canonicalize the splat or undef, if present, to be on the RHS.
6005  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
6006    Op = CommuteVectorShuffle(SVOp, DAG);
6007    SVOp = cast<ShuffleVectorSDNode>(Op);
6008    V1 = SVOp->getOperand(0);
6009    V2 = SVOp->getOperand(1);
6010    std::swap(V1IsSplat, V2IsSplat);
6011    std::swap(V1IsUndef, V2IsUndef);
6012    Commuted = true;
6013  }
6014
6015  if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
6016    // Shuffling the low element of V1 into undef; just return V1.
6017    if (V2IsUndef)
6018      return V1;
6019    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
6020    // the instruction selector will not match, so get a canonical MOVL with
6021    // swapped operands to undo the commute.
6022    return getMOVL(DAG, dl, VT, V2, V1);
6023  }
6024
6025  if (X86::isUNPCKLMask(SVOp))
6026    return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG);
6027
6028  if (X86::isUNPCKHMask(SVOp))
6029    return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG);
6030
6031  if (V2IsSplat) {
6032    // Normalize the mask so all entries that point to V2 point to its first
6033    // element, then try to match unpck{h|l} again. If the match succeeds,
6034    // return a new vector_shuffle with the corrected mask.
6035    SDValue NewMask = NormalizeMask(SVOp, DAG);
6036    ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
6037    if (NSVOp != SVOp) {
6038      if (X86::isUNPCKLMask(NSVOp, true)) {
6039        return NewMask;
6040      } else if (X86::isUNPCKHMask(NSVOp, true)) {
6041        return NewMask;
6042      }
6043    }
6044  }
6045
6046  if (Commuted) {
6047    // Commute it back and try unpck* again.
6048    // FIXME: this seems wrong.
6049    SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
6050    ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
6051
6052    if (X86::isUNPCKLMask(NewSVOp))
6053      return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG);
6054
6055    if (X86::isUNPCKHMask(NewSVOp))
6056      return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG);
6057  }
6058
6059  // Normalize the node to match x86 shuffle ops if needed.
6060  if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
6061    return CommuteVectorShuffle(SVOp, DAG);
6062
6063  // The checks below are all present in isShuffleMaskLegal, but they are
6064  // inlined here right now to enable us to directly emit target specific
6065  // nodes, and remove one by one until they don't return Op anymore.
6066 SmallVector<int, 16> M; 6067 SVOp->getMask(M); 6068 6069 if (isPALIGNRMask(M, VT, HasSSSE3)) 6070 return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, 6071 X86::getShufflePALIGNRImmediate(SVOp), 6072 DAG); 6073 6074 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 6075 SVOp->getSplatIndex() == 0 && V2IsUndef) { 6076 if (VT == MVT::v2f64) 6077 return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); 6078 if (VT == MVT::v2i64) 6079 return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); 6080 } 6081 6082 if (isPSHUFHWMask(M, VT)) 6083 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 6084 X86::getShufflePSHUFHWImmediate(SVOp), 6085 DAG); 6086 6087 if (isPSHUFLWMask(M, VT)) 6088 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 6089 X86::getShufflePSHUFLWImmediate(SVOp), 6090 DAG); 6091 6092 if (isSHUFPMask(M, VT)) { 6093 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 6094 if (VT == MVT::v4f32 || VT == MVT::v4i32) 6095 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2, 6096 TargetMask, DAG); 6097 if (VT == MVT::v2f64 || VT == MVT::v2i64) 6098 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2, 6099 TargetMask, DAG); 6100 } 6101 6102 if (X86::isUNPCKL_v_undef_Mask(SVOp)) 6103 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 6104 if (X86::isUNPCKH_v_undef_Mask(SVOp)) 6105 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 6106 6107 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 6108 if (VT == MVT::v8i16) { 6109 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 6110 if (NewOp.getNode()) 6111 return NewOp; 6112 } 6113 6114 if (VT == MVT::v16i8) { 6115 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 6116 if (NewOp.getNode()) 6117 return NewOp; 6118 } 6119 6120 // Handle all 128-bit wide vectors with 4 elements, and match them with 6121 // several different shuffle types. 6122 if (NumElems == 4 && VT.getSizeInBits() == 128) 6123 return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); 6124 6125 //===--------------------------------------------------------------------===// 6126 // Custom lower or generate target specific nodes for 256-bit shuffles. 6127 6128 // Handle VPERMIL permutations 6129 if (isVPERMILMask(M, VT)) { 6130 unsigned TargetMask = getShuffleVPERMILImmediate(SVOp); 6131 if (VT == MVT::v8f32) 6132 return getTargetShuffleNode(X86ISD::VPERMIL, dl, VT, V1, TargetMask, DAG); 6133 } 6134 6135 // Handle general 256-bit shuffles 6136 if (VT.is256BitVector()) 6137 return LowerVECTOR_SHUFFLE_256(SVOp, DAG); 6138 6139 return SDValue(); 6140} 6141 6142SDValue 6143X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 6144 SelectionDAG &DAG) const { 6145 EVT VT = Op.getValueType(); 6146 DebugLoc dl = Op.getDebugLoc(); 6147 if (VT.getSizeInBits() == 8) { 6148 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 6149 Op.getOperand(0), Op.getOperand(1)); 6150 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6151 DAG.getValueType(VT)); 6152 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6153 } else if (VT.getSizeInBits() == 16) { 6154 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6155 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 
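    // (The bitcast to v4i32 below lets this select as a plain
    // "movd %xmm0, %eax", whose low 16 bits already hold element 0.)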
6156 if (Idx == 0) 6157 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6158 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6159 DAG.getNode(ISD::BITCAST, dl, 6160 MVT::v4i32, 6161 Op.getOperand(0)), 6162 Op.getOperand(1))); 6163 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 6164 Op.getOperand(0), Op.getOperand(1)); 6165 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6166 DAG.getValueType(VT)); 6167 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6168 } else if (VT == MVT::f32) { 6169 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 6170 // the result back to FR32 register. It's only worth matching if the 6171 // result has a single use which is a store or a bitcast to i32. And in 6172 // the case of a store, it's not worth it if the index is a constant 0, 6173 // because a MOVSSmr can be used instead, which is smaller and faster. 6174 if (!Op.hasOneUse()) 6175 return SDValue(); 6176 SDNode *User = *Op.getNode()->use_begin(); 6177 if ((User->getOpcode() != ISD::STORE || 6178 (isa<ConstantSDNode>(Op.getOperand(1)) && 6179 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 6180 (User->getOpcode() != ISD::BITCAST || 6181 User->getValueType(0) != MVT::i32)) 6182 return SDValue(); 6183 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6184 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 6185 Op.getOperand(0)), 6186 Op.getOperand(1)); 6187 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 6188 } else if (VT == MVT::i32) { 6189 // ExtractPS works with constant index. 6190 if (isa<ConstantSDNode>(Op.getOperand(1))) 6191 return Op; 6192 } 6193 return SDValue(); 6194} 6195 6196 6197SDValue 6198X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 6199 SelectionDAG &DAG) const { 6200 if (!isa<ConstantSDNode>(Op.getOperand(1))) 6201 return SDValue(); 6202 6203 SDValue Vec = Op.getOperand(0); 6204 EVT VecVT = Vec.getValueType(); 6205 6206 // If this is a 256-bit vector result, first extract the 128-bit 6207 // vector and then extract from the 128-bit vector. 6208 if (VecVT.getSizeInBits() > 128) { 6209 DebugLoc dl = Op.getNode()->getDebugLoc(); 6210 unsigned NumElems = VecVT.getVectorNumElements(); 6211 SDValue Idx = Op.getOperand(1); 6212 6213 if (!isa<ConstantSDNode>(Idx)) 6214 return SDValue(); 6215 6216 unsigned ExtractNumElems = NumElems / (VecVT.getSizeInBits() / 128); 6217 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 6218 6219 // Get the 128-bit vector. 6220 bool Upper = IdxVal >= ExtractNumElems; 6221 Vec = Extract128BitVector(Vec, Idx, DAG, dl); 6222 6223 // Extract from it. 6224 SDValue ScaledIdx = Idx; 6225 if (Upper) 6226 ScaledIdx = DAG.getNode(ISD::SUB, dl, Idx.getValueType(), Idx, 6227 DAG.getConstant(ExtractNumElems, 6228 Idx.getValueType())); 6229 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, 6230 ScaledIdx); 6231 } 6232 6233 assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); 6234 6235 if (Subtarget->hasSSE41()) { 6236 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 6237 if (Res.getNode()) 6238 return Res; 6239 } 6240 6241 EVT VT = Op.getValueType(); 6242 DebugLoc dl = Op.getDebugLoc(); 6243 // TODO: handle v16i8. 
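  // (A rough picture of the code generated by the path below: extracting
  // element 3 of a v8i16 becomes "pextrw $3, %xmm0, %eax", which
  // zero-extends into the full 32-bit register -- hence the AssertZext plus
  // truncate pattern.)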
6244  if (VT.getSizeInBits() == 16) {
6245    SDValue Vec = Op.getOperand(0);
6246    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6247    if (Idx == 0)
6248      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
6249                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
6250                                     DAG.getNode(ISD::BITCAST, dl,
6251                                                 MVT::v4i32, Vec),
6252                                     Op.getOperand(1)));
6253    // Transform it so it matches pextrw, which produces a 32-bit result.
6254    EVT EltVT = MVT::i32;
6255    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
6256                                  Op.getOperand(0), Op.getOperand(1));
6257    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
6258                                 DAG.getValueType(VT));
6259    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
6260  } else if (VT.getSizeInBits() == 32) {
6261    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6262    if (Idx == 0)
6263      return Op;
6264
6265    // SHUFPS the element to the lowest double word, then movss.
6266    int Mask[4] = { Idx, -1, -1, -1 };
6267    EVT VVT = Op.getOperand(0).getValueType();
6268    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
6269                                       DAG.getUNDEF(VVT), Mask);
6270    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
6271                       DAG.getIntPtrConstant(0));
6272  } else if (VT.getSizeInBits() == 64) {
6273    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
6274    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
6275    //        to match extract_elt for f64.
6276    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6277    if (Idx == 0)
6278      return Op;
6279
6280    // UNPCKHPD the element to the lowest double word, then movsd.
6281    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
6282    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
6283    int Mask[2] = { 1, -1 };
6284    EVT VVT = Op.getOperand(0).getValueType();
6285    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
6286                                       DAG.getUNDEF(VVT), Mask);
6287    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
6288                       DAG.getIntPtrConstant(0));
6289  }
6290
6291  return SDValue();
6292}
6293
6294SDValue
6295X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
6296                                               SelectionDAG &DAG) const {
6297  EVT VT = Op.getValueType();
6298  EVT EltVT = VT.getVectorElementType();
6299  DebugLoc dl = Op.getDebugLoc();
6300
6301  SDValue N0 = Op.getOperand(0);
6302  SDValue N1 = Op.getOperand(1);
6303  SDValue N2 = Op.getOperand(2);
6304
6305  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
6306      isa<ConstantSDNode>(N2)) {
6307    unsigned Opc;
6308    if (VT == MVT::v8i16)
6309      Opc = X86ISD::PINSRW;
6310    else if (VT == MVT::v16i8)
6311      Opc = X86ISD::PINSRB;
6312    else
6313      Opc = X86ISD::PINSRB;
6314
6315    // Transform it so it matches pinsr{b,w}, which expect a GR32 as their
6316    // second argument.
6317    if (N1.getValueType() != MVT::i32)
6318      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
6319    if (N2.getValueType() != MVT::i32)
6320      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
6321    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
6322  } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
6323    // Bits [7:6] of the constant are the source select. This will always be
6324    // zero here. The DAG Combiner may combine an extract_elt index into these
6325    // bits. For example (insert (extract, 3), 2) could be matched by putting
6326    // the '3' into bits [7:6] of X86ISD::INSERTPS.
6327    // Bits [5:4] of the constant are the destination select. This is the
6328    // value of the incoming immediate.
6329    // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
6330    // combine either a bitwise AND or an insert of float 0.0 to set these bits.
6331    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
6332    // Create this as a scalar-to-vector node.
6333    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
6334    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
6335  } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
6336    // PINSR* works with constant index.
6337    return Op;
6338  }
6339  return SDValue();
6340}
6341
6342SDValue
6343X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
6344  EVT VT = Op.getValueType();
6345  EVT EltVT = VT.getVectorElementType();
6346
6347  DebugLoc dl = Op.getDebugLoc();
6348  SDValue N0 = Op.getOperand(0);
6349  SDValue N1 = Op.getOperand(1);
6350  SDValue N2 = Op.getOperand(2);
6351
6352  // If this is a 256-bit vector result, first insert into a 128-bit
6353  // vector and then insert into the 256-bit vector.
6354  if (VT.getSizeInBits() > 128) {
6355    if (!isa<ConstantSDNode>(N2))
6356      return SDValue();
6357
6358    // Get the 128-bit vector.
6359    unsigned NumElems = VT.getVectorNumElements();
6360    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
6361    bool Upper = IdxVal >= NumElems / 2;
6362
6363    SDValue SubN0 = Extract128BitVector(N0, N2, DAG, dl);
6364
6365    // Insert into it.
6366    SDValue ScaledN2 = N2;
6367    if (Upper)
6368      ScaledN2 = DAG.getNode(ISD::SUB, dl, N2.getValueType(), N2,
6369                             DAG.getConstant(NumElems /
6370                                             (VT.getSizeInBits() / 128),
6371                                             N2.getValueType()));
6372    Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubN0.getValueType(), SubN0,
6373                     N1, ScaledN2);
6374
6375    // Insert the 128-bit vector
6376    // FIXME: Why UNDEF?
6377    return Insert128BitVector(N0, Op, N2, DAG, dl);
6378  }
6379
6380  if (Subtarget->hasSSE41())
6381    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
6382
6383  if (EltVT == MVT::i8)
6384    return SDValue();
6385
6386  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
6387    // Transform it so it matches pinsrw, which expects a 16-bit value in a
6388    // GR32 as its second argument.
6389    if (N1.getValueType() != MVT::i32)
6390      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
6391    if (N2.getValueType() != MVT::i32)
6392      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
6393    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
6394  }
6395  return SDValue();
6396}
6397
6398SDValue
6399X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6400  LLVMContext *Context = DAG.getContext();
6401  DebugLoc dl = Op.getDebugLoc();
6402  EVT OpVT = Op.getValueType();
6403
6404  // If this is a 256-bit vector result, first insert into a 128-bit
6405  // vector and then insert into the 256-bit vector.
6406  if (OpVT.getSizeInBits() > 128) {
6407    // Insert into a 128-bit vector.
6408    EVT VT128 = EVT::getVectorVT(*Context,
6409                                 OpVT.getVectorElementType(),
6410                                 OpVT.getVectorNumElements() / 2);
6411
6412    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
6413
6414    // Insert the 128-bit vector.
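    // (E.g. a SCALAR_TO_VECTOR of f32 to v8f32 is built as a v4f32
    // scalar_to_vector dropped into lane 0 of an undef v8f32, which AVX can
    // do with a simple subregister insert rather than an explicit
    // vinsertf128.)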
6415    return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op,
6416                              DAG.getConstant(0, MVT::i32),
6417                              DAG, dl);
6418  }
6419
6420  if (Op.getValueType() == MVT::v1i64 &&
6421      Op.getOperand(0).getValueType() == MVT::i64)
6422    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
6423
6424  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
6425  assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 &&
6426         "Expected an SSE type!");
6427  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(),
6428                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
6429}
6430
6431// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
6432// a simple subregister reference or explicit instructions to grab
6433// upper bits of a vector.
6434SDValue
6435X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
6436  if (Subtarget->hasAVX()) {
6437    DebugLoc dl = Op.getNode()->getDebugLoc();
6438    SDValue Vec = Op.getNode()->getOperand(0);
6439    SDValue Idx = Op.getNode()->getOperand(1);
6440
6441    if (Op.getNode()->getValueType(0).getSizeInBits() == 128
6442        && Vec.getNode()->getValueType(0).getSizeInBits() == 256) {
6443      return Extract128BitVector(Vec, Idx, DAG, dl);
6444    }
6445  }
6446  return SDValue();
6447}
6448
6449// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
6450// simple superregister reference or explicit instructions to insert
6451// the upper bits of a vector.
6452SDValue
6453X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
6454  if (Subtarget->hasAVX()) {
6455    DebugLoc dl = Op.getNode()->getDebugLoc();
6456    SDValue Vec = Op.getNode()->getOperand(0);
6457    SDValue SubVec = Op.getNode()->getOperand(1);
6458    SDValue Idx = Op.getNode()->getOperand(2);
6459
6460    if (Op.getNode()->getValueType(0).getSizeInBits() == 256
6461        && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) {
6462      return Insert128BitVector(Vec, SubVec, Idx, DAG, dl);
6463    }
6464  }
6465  return SDValue();
6466}
6467
6468// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
6469// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
6470// one of the above mentioned nodes. It has to be wrapped because otherwise
6471// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
6472// be used to form addressing modes. These wrapped nodes will be selected
6473// into MOV32ri.
6474SDValue
6475X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
6476  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
6477
6478  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
6479  // global base reg.
6480  unsigned char OpFlag = 0;
6481  unsigned WrapperKind = X86ISD::Wrapper;
6482  CodeModel::Model M = getTargetMachine().getCodeModel();
6483
6484  if (Subtarget->isPICStyleRIPRel() &&
6485      (M == CodeModel::Small || M == CodeModel::Kernel))
6486    WrapperKind = X86ISD::WrapperRIP;
6487  else if (Subtarget->isPICStyleGOT())
6488    OpFlag = X86II::MO_GOTOFF;
6489  else if (Subtarget->isPICStyleStubPIC())
6490    OpFlag = X86II::MO_PIC_BASE_OFFSET;
6491
6492  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
6493                                             CP->getAlignment(),
6494                                             CP->getOffset(), OpFlag);
6495  DebugLoc DL = CP->getDebugLoc();
6496  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
6497  // With PIC, the address is actually $g + Offset.
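  // (A rough sketch of the 32-bit GOT-style case: the wrapped target node
  // prints as something like .LCPI0_0@GOTOFF and the ADD below supplies the
  // PIC base register, so the final address is formed roughly as
  // "leal .LCPI0_0@GOTOFF(%ebx), %eax". The label name is illustrative only.)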
6498 if (OpFlag) { 6499 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6500 DAG.getNode(X86ISD::GlobalBaseReg, 6501 DebugLoc(), getPointerTy()), 6502 Result); 6503 } 6504 6505 return Result; 6506} 6507 6508SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 6509 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 6510 6511 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6512 // global base reg. 6513 unsigned char OpFlag = 0; 6514 unsigned WrapperKind = X86ISD::Wrapper; 6515 CodeModel::Model M = getTargetMachine().getCodeModel(); 6516 6517 if (Subtarget->isPICStyleRIPRel() && 6518 (M == CodeModel::Small || M == CodeModel::Kernel)) 6519 WrapperKind = X86ISD::WrapperRIP; 6520 else if (Subtarget->isPICStyleGOT()) 6521 OpFlag = X86II::MO_GOTOFF; 6522 else if (Subtarget->isPICStyleStubPIC()) 6523 OpFlag = X86II::MO_PIC_BASE_OFFSET; 6524 6525 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 6526 OpFlag); 6527 DebugLoc DL = JT->getDebugLoc(); 6528 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6529 6530 // With PIC, the address is actually $g + Offset. 6531 if (OpFlag) 6532 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6533 DAG.getNode(X86ISD::GlobalBaseReg, 6534 DebugLoc(), getPointerTy()), 6535 Result); 6536 6537 return Result; 6538} 6539 6540SDValue 6541X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 6542 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 6543 6544 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6545 // global base reg. 6546 unsigned char OpFlag = 0; 6547 unsigned WrapperKind = X86ISD::Wrapper; 6548 CodeModel::Model M = getTargetMachine().getCodeModel(); 6549 6550 if (Subtarget->isPICStyleRIPRel() && 6551 (M == CodeModel::Small || M == CodeModel::Kernel)) 6552 WrapperKind = X86ISD::WrapperRIP; 6553 else if (Subtarget->isPICStyleGOT()) 6554 OpFlag = X86II::MO_GOTOFF; 6555 else if (Subtarget->isPICStyleStubPIC()) 6556 OpFlag = X86II::MO_PIC_BASE_OFFSET; 6557 6558 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 6559 6560 DebugLoc DL = Op.getDebugLoc(); 6561 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6562 6563 6564 // With PIC, the address is actually $g + Offset. 6565 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 6566 !Subtarget->is64Bit()) { 6567 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6568 DAG.getNode(X86ISD::GlobalBaseReg, 6569 DebugLoc(), getPointerTy()), 6570 Result); 6571 } 6572 6573 return Result; 6574} 6575 6576SDValue 6577X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 6578 // Create the TargetBlockAddressAddress node. 6579 unsigned char OpFlags = 6580 Subtarget->ClassifyBlockAddressReference(); 6581 CodeModel::Model M = getTargetMachine().getCodeModel(); 6582 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 6583 DebugLoc dl = Op.getDebugLoc(); 6584 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 6585 /*isTarget=*/true, OpFlags); 6586 6587 if (Subtarget->isPICStyleRIPRel() && 6588 (M == CodeModel::Small || M == CodeModel::Kernel)) 6589 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 6590 else 6591 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 6592 6593 // With PIC, the address is actually $g + Offset. 
6594 if (isGlobalRelativeToPICBase(OpFlags)) { 6595 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6596 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 6597 Result); 6598 } 6599 6600 return Result; 6601} 6602 6603SDValue 6604X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 6605 int64_t Offset, 6606 SelectionDAG &DAG) const { 6607 // Create the TargetGlobalAddress node, folding in the constant 6608 // offset if it is legal. 6609 unsigned char OpFlags = 6610 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 6611 CodeModel::Model M = getTargetMachine().getCodeModel(); 6612 SDValue Result; 6613 if (OpFlags == X86II::MO_NO_FLAG && 6614 X86::isOffsetSuitableForCodeModel(Offset, M)) { 6615 // A direct static reference to a global. 6616 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 6617 Offset = 0; 6618 } else { 6619 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 6620 } 6621 6622 if (Subtarget->isPICStyleRIPRel() && 6623 (M == CodeModel::Small || M == CodeModel::Kernel)) 6624 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 6625 else 6626 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 6627 6628 // With PIC, the address is actually $g + Offset. 6629 if (isGlobalRelativeToPICBase(OpFlags)) { 6630 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6631 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 6632 Result); 6633 } 6634 6635 // For globals that require a load from a stub to get the address, emit the 6636 // load. 6637 if (isGlobalStubReference(OpFlags)) 6638 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 6639 MachinePointerInfo::getGOT(), false, false, 0); 6640 6641 // If there was a non-zero offset that we didn't fold, create an explicit 6642 // addition for it. 6643 if (Offset != 0) 6644 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 6645 DAG.getConstant(Offset, getPointerTy())); 6646 6647 return Result; 6648} 6649 6650SDValue 6651X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 6652 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 6653 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 6654 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 6655} 6656 6657static SDValue 6658GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 6659 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 6660 unsigned char OperandFlags) { 6661 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6662 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 6663 DebugLoc dl = GA->getDebugLoc(); 6664 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 6665 GA->getValueType(0), 6666 GA->getOffset(), 6667 OperandFlags); 6668 if (InFlag) { 6669 SDValue Ops[] = { Chain, TGA, *InFlag }; 6670 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 6671 } else { 6672 SDValue Ops[] = { Chain, TGA }; 6673 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 6674 } 6675 6676 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 
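  // (Roughly, on 32-bit ELF the general-dynamic sequence this becomes is:
  //    leal x@TLSGD(,%ebx,1), %eax
  //    call ___tls_get_addr@PLT
  // which is why the stack-adjustment bookkeeping below is required.)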
6677  MFI->setAdjustsStack(true);
6678
6679  SDValue Flag = Chain.getValue(1);
6680  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
6681}
6682
6683// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
6684static SDValue
6685LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
6686                                const EVT PtrVT) {
6687  SDValue InFlag;
6688  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
6689  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
6690                                   DAG.getNode(X86ISD::GlobalBaseReg,
6691                                               DebugLoc(), PtrVT), InFlag);
6692  InFlag = Chain.getValue(1);
6693
6694  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
6695}
6696
6697// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
6698static SDValue
6699LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
6700                                const EVT PtrVT) {
6701  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
6702                    X86::RAX, X86II::MO_TLSGD);
6703}
6704
6705// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
6706// "local exec" model.
6707static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
6708                                   const EVT PtrVT, TLSModel::Model model,
6709                                   bool is64Bit) {
6710  DebugLoc dl = GA->getDebugLoc();
6711
6712  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
6713  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
6714                                                         is64Bit ? 257 : 256));
6715
6716  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
6717                                      DAG.getIntPtrConstant(0),
6718                                      MachinePointerInfo(Ptr), false, false, 0);
6719
6720  unsigned char OperandFlags = 0;
6721  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
6722  // initial exec.
6723  unsigned WrapperKind = X86ISD::Wrapper;
6724  if (model == TLSModel::LocalExec) {
6725    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
6726  } else if (is64Bit) {
6727    assert(model == TLSModel::InitialExec);
6728    OperandFlags = X86II::MO_GOTTPOFF;
6729    WrapperKind = X86ISD::WrapperRIP;
6730  } else {
6731    assert(model == TLSModel::InitialExec);
6732    OperandFlags = X86II::MO_INDNTPOFF;
6733  }
6734
6735  // Emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax"
6736  // (initial exec).
6737  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
6738                                           GA->getValueType(0),
6739                                           GA->getOffset(), OperandFlags);
6740  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
6741
6742  if (model == TLSModel::InitialExec)
6743    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
6744                         MachinePointerInfo::getGOT(), false, false, 0);
6745
6746  // The address of the thread local variable is the sum of the thread
6747  // pointer and the offset of the variable.
6748  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
6749}
6750
6751SDValue
6752X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
6753
6754  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
6755  const GlobalValue *GV = GA->getGlobal();
6756
6757  if (Subtarget->isTargetELF()) {
6758    // TODO: implement the "local dynamic" model
6759    // TODO: implement the "initial exec" model for PIC executables
6760
6761    // If GV is an alias then use the aliasee for determining
6762    // thread-localness.
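    // (E.g. for "@a = alias i32* @tls_var" the TLS model must be chosen from
    // @tls_var itself; the names here are illustrative only.)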
6763    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
6764      GV = GA->resolveAliasedGlobal(false);
6765
6766    TLSModel::Model model
6767      = getTLSModel(GV, getTargetMachine().getRelocationModel());
6768
6769    switch (model) {
6770      case TLSModel::GeneralDynamic:
6771      case TLSModel::LocalDynamic: // not implemented
6772        if (Subtarget->is64Bit())
6773          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
6774        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
6775
6776      case TLSModel::InitialExec:
6777      case TLSModel::LocalExec:
6778        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
6779                                   Subtarget->is64Bit());
6780    }
6781  } else if (Subtarget->isTargetDarwin()) {
6782    // Darwin only has one model of TLS. Lower to that.
6783    unsigned char OpFlag = 0;
6784    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
6785                           X86ISD::WrapperRIP : X86ISD::Wrapper;
6786
6787    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
6788    // global base reg.
6789    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
6790                 !Subtarget->is64Bit();
6791    if (PIC32)
6792      OpFlag = X86II::MO_TLVP_PIC_BASE;
6793    else
6794      OpFlag = X86II::MO_TLVP;
6795    DebugLoc DL = Op.getDebugLoc();
6796    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
6797                                                GA->getValueType(0),
6798                                                GA->getOffset(), OpFlag);
6799    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
6800
6801    // With PIC32, the address is actually $g + Offset.
6802    if (PIC32)
6803      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
6804                           DAG.getNode(X86ISD::GlobalBaseReg,
6805                                       DebugLoc(), getPointerTy()),
6806                           Offset);
6807
6808    // Lowering the machine ISD node will make sure everything is in the right
6809    // location.
6810    SDValue Chain = DAG.getEntryNode();
6811    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
6812    SDValue Args[] = { Chain, Offset };
6813    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
6814
6815    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
6816    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
6817    MFI->setAdjustsStack(true);
6818
6819    // And our return value (tls address) is in the standard call return value
6820    // location.
6821    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
6822    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
6823  }
6824
6825  assert(false &&
6826         "TLS not implemented for this target.");
6827
6828  llvm_unreachable("TLS not implemented for this target.");
6829  return SDValue();
6830}
6831
6832
6833/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and
6834/// take a 2 x i32 value to shift plus a shift amount.
6835SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const {
6836  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6837  EVT VT = Op.getValueType();
6838  unsigned VTBits = VT.getSizeInBits();
6839  DebugLoc dl = Op.getDebugLoc();
6840  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
6841  SDValue ShOpLo = Op.getOperand(0);
6842  SDValue ShOpHi = Op.getOperand(1);
6843  SDValue ShAmt  = Op.getOperand(2);
6844  SDValue Tmp1 = isSRA ?
DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 6845 DAG.getConstant(VTBits - 1, MVT::i8)) 6846 : DAG.getConstant(0, VT); 6847 6848 SDValue Tmp2, Tmp3; 6849 if (Op.getOpcode() == ISD::SHL_PARTS) { 6850 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 6851 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 6852 } else { 6853 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 6854 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 6855 } 6856 6857 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 6858 DAG.getConstant(VTBits, MVT::i8)); 6859 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 6860 AndNode, DAG.getConstant(0, MVT::i8)); 6861 6862 SDValue Hi, Lo; 6863 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6864 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 6865 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 6866 6867 if (Op.getOpcode() == ISD::SHL_PARTS) { 6868 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6869 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6870 } else { 6871 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6872 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6873 } 6874 6875 SDValue Ops[2] = { Lo, Hi }; 6876 return DAG.getMergeValues(Ops, 2, dl); 6877} 6878 6879SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 6880 SelectionDAG &DAG) const { 6881 EVT SrcVT = Op.getOperand(0).getValueType(); 6882 6883 if (SrcVT.isVector()) 6884 return SDValue(); 6885 6886 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 6887 "Unknown SINT_TO_FP to lower!"); 6888 6889 // These are really Legal; return the operand so the caller accepts it as 6890 // Legal. 6891 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 6892 return Op; 6893 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 6894 Subtarget->is64Bit()) { 6895 return Op; 6896 } 6897 6898 DebugLoc dl = Op.getDebugLoc(); 6899 unsigned Size = SrcVT.getSizeInBits()/8; 6900 MachineFunction &MF = DAG.getMachineFunction(); 6901 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 6902 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6903 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6904 StackSlot, 6905 MachinePointerInfo::getFixedStack(SSFI), 6906 false, false, 0); 6907 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 6908} 6909 6910SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 6911 SDValue StackSlot, 6912 SelectionDAG &DAG) const { 6913 // Build the FILD 6914 DebugLoc DL = Op.getDebugLoc(); 6915 SDVTList Tys; 6916 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 6917 if (useSSE) 6918 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 6919 else 6920 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 6921 6922 unsigned ByteSize = SrcVT.getSizeInBits()/8; 6923 6924 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); 6925 MachineMemOperand *MMO; 6926 if (FI) { 6927 int SSFI = FI->getIndex(); 6928 MMO = 6929 DAG.getMachineFunction() 6930 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6931 MachineMemOperand::MOLoad, ByteSize, ByteSize); 6932 } else { 6933 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); 6934 StackSlot = StackSlot.getOperand(1); 6935 } 6936 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 6937 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? 
X86ISD::FILD_FLAG : 6938 X86ISD::FILD, DL, 6939 Tys, Ops, array_lengthof(Ops), 6940 SrcVT, MMO); 6941 6942 if (useSSE) { 6943 Chain = Result.getValue(1); 6944 SDValue InFlag = Result.getValue(2); 6945 6946 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 6947 // shouldn't be necessary except that RFP cannot be live across 6948 // multiple blocks. When stackifier is fixed, they can be uncoupled. 6949 MachineFunction &MF = DAG.getMachineFunction(); 6950 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 6951 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 6952 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6953 Tys = DAG.getVTList(MVT::Other); 6954 SDValue Ops[] = { 6955 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 6956 }; 6957 MachineMemOperand *MMO = 6958 DAG.getMachineFunction() 6959 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6960 MachineMemOperand::MOStore, SSFISize, SSFISize); 6961 6962 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 6963 Ops, array_lengthof(Ops), 6964 Op.getValueType(), MMO); 6965 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 6966 MachinePointerInfo::getFixedStack(SSFI), 6967 false, false, 0); 6968 } 6969 6970 return Result; 6971} 6972 6973// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 6974SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 6975 SelectionDAG &DAG) const { 6976 // This algorithm is not obvious. Here it is in C code, more or less: 6977 /* 6978 double uint64_to_double( uint32_t hi, uint32_t lo ) { 6979 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 6980 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 6981 6982 // Copy ints to xmm registers. 6983 __m128i xh = _mm_cvtsi32_si128( hi ); 6984 __m128i xl = _mm_cvtsi32_si128( lo ); 6985 6986 // Combine into low half of a single xmm register. 6987 __m128i x = _mm_unpacklo_epi32( xh, xl ); 6988 __m128d d; 6989 double sd; 6990 6991 // Merge in appropriate exponents to give the integer bits the right 6992 // magnitude. 6993 x = _mm_unpacklo_epi32( x, exp ); 6994 6995 // Subtract away the biases to deal with the IEEE-754 double precision 6996 // implicit 1. 6997 d = _mm_sub_pd( (__m128d) x, bias ); 6998 6999 // All conversions up to here are exact. The correctly rounded result is 7000 // calculated using the current rounding mode using the following 7001 // horizontal add. 7002 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 7003 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 7004 // store doesn't really need to be here (except 7005 // maybe to zero the other double) 7006 return sd; 7007 } 7008 */ 7009 7010 DebugLoc dl = Op.getDebugLoc(); 7011 LLVMContext *Context = DAG.getContext(); 7012 7013 // Build some magic constants. 
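  // (For reference: 0x45300000 and 0x43300000 below are the high words of
  // the doubles 2^84 and 2^52, i.e. the two halves of the 'exp' constant in
  // the C sketch above, and the two f64 constants that follow are the
  // corresponding 'bias' values 0x1.0p84 and 0x1.0p52.)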
7014 std::vector<Constant*> CV0; 7015 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 7016 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 7017 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 7018 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 7019 Constant *C0 = ConstantVector::get(CV0); 7020 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 7021 7022 std::vector<Constant*> CV1; 7023 CV1.push_back( 7024 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 7025 CV1.push_back( 7026 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 7027 Constant *C1 = ConstantVector::get(CV1); 7028 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 7029 7030 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7031 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7032 Op.getOperand(0), 7033 DAG.getIntPtrConstant(1))); 7034 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7035 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7036 Op.getOperand(0), 7037 DAG.getIntPtrConstant(0))); 7038 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 7039 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 7040 MachinePointerInfo::getConstantPool(), 7041 false, false, 16); 7042 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 7043 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); 7044 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 7045 MachinePointerInfo::getConstantPool(), 7046 false, false, 16); 7047 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 7048 7049 // Add the halves; easiest way is to swap them into another reg first. 7050 int ShufMask[2] = { 1, -1 }; 7051 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 7052 DAG.getUNDEF(MVT::v2f64), ShufMask); 7053 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 7054 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 7055 DAG.getIntPtrConstant(0)); 7056} 7057 7058// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 7059SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 7060 SelectionDAG &DAG) const { 7061 DebugLoc dl = Op.getDebugLoc(); 7062 // FP constant to bias correct the final result. 7063 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 7064 MVT::f64); 7065 7066 // Load the 32-bit value into an XMM register. 7067 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7068 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7069 Op.getOperand(0), 7070 DAG.getIntPtrConstant(0))); 7071 7072 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 7073 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), 7074 DAG.getIntPtrConstant(0)); 7075 7076 // Or the load with the bias. 7077 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 7078 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7079 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 7080 MVT::v2f64, Load)), 7081 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7082 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 7083 MVT::v2f64, Bias))); 7084 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 7085 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), 7086 DAG.getIntPtrConstant(0)); 7087 7088 // Subtract the bias. 7089 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 7090 7091 // Handle final rounding. 
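  // (Everything up to the FSUB above is exact -- the OR merely plants the
  // 32-bit payload in the mantissa of the 2^52 bias -- so when the
  // destination is f32 the FP_ROUND below is the single correctly rounded
  // step.)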
7092  EVT DestVT = Op.getValueType();
7093
7094  if (DestVT.bitsLT(MVT::f64)) {
7095    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
7096                       DAG.getIntPtrConstant(0));
7097  } else if (DestVT.bitsGT(MVT::f64)) {
7098    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
7099  }
7100
7101  // The destination is f64, so no final rounding is needed.
7102  return Sub;
7103}
7104
7105SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
7106                                           SelectionDAG &DAG) const {
7107  SDValue N0 = Op.getOperand(0);
7108  DebugLoc dl = Op.getDebugLoc();
7109
7110  // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
7111  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
7112  // the optimization here.
7113  if (DAG.SignBitIsZero(N0))
7114    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
7115
7116  EVT SrcVT = N0.getValueType();
7117  EVT DstVT = Op.getValueType();
7118  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
7119    return LowerUINT_TO_FP_i64(Op, DAG);
7120  else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
7121    return LowerUINT_TO_FP_i32(Op, DAG);
7122
7123  // Make a 64-bit buffer, and use it to build an FILD.
7124  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
7125  if (SrcVT == MVT::i32) {
7126    SDValue WordOff = DAG.getConstant(4, getPointerTy());
7127    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
7128                                     getPointerTy(), StackSlot, WordOff);
7129    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
7130                                  StackSlot, MachinePointerInfo(),
7131                                  false, false, 0);
7132    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
7133                                  OffsetSlot, MachinePointerInfo(),
7134                                  false, false, 0);
7135    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
7136    return Fild;
7137  }
7138
7139  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
7140  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
7141                               StackSlot, MachinePointerInfo(),
7142                               false, false, 0);
7143  // For i64 source, we need to add the appropriate power of 2 if the input
7144  // was negative. This is the same as the optimization in
7145  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
7146  // we must be careful to do the computation in x87 extended precision, not
7147  // in SSE. (The generic code can't know it's OK to do this, or how to.)
7148  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
7149  MachineMemOperand *MMO =
7150    DAG.getMachineFunction()
7151    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
7152                          MachineMemOperand::MOLoad, 8, 8);
7153
7154  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
7155  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
7156  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3,
7157                                         MVT::i64, MMO);
7158
7159  APInt FF(32, 0x5F800000ULL);
7160
7161  // Check whether the sign bit is set.
7162  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
7163                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
7164                                 ISD::SETLT);
7165
7166  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
7167  SDValue FudgePtr = DAG.getConstantPool(
7168                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
7169                             getPointerTy());
7170
7171  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
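  // (FF = 0x5F800000 is 2^64 as an f32. FILD read the buffer as a *signed*
  // i64, so an input with the sign bit set arrived as x - 2^64. A rough
  // scalar picture of the fixup, for illustration only:
  //
  //   double uint64_to_double(uint64_t x) {
  //     double d = (double)(int64_t)x;   // what FILD computes
  //     if ((int64_t)x < 0)
  //       d += 0x1.0p64;                 // add back 2^64
  //     return d;
  //   }
  // )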
7172 SDValue Zero = DAG.getIntPtrConstant(0); 7173 SDValue Four = DAG.getIntPtrConstant(4); 7174 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 7175 Zero, Four); 7176 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 7177 7178 // Load the value out, extending it from f32 to f80. 7179 // FIXME: Avoid the extend by constructing the right constant pool? 7180 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), 7181 FudgePtr, MachinePointerInfo::getConstantPool(), 7182 MVT::f32, false, false, 4); 7183 // Extend everything to 80 bits to force it to be done on x87. 7184 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 7185 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 7186} 7187 7188std::pair<SDValue,SDValue> X86TargetLowering:: 7189FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 7190 DebugLoc DL = Op.getDebugLoc(); 7191 7192 EVT DstTy = Op.getValueType(); 7193 7194 if (!IsSigned) { 7195 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 7196 DstTy = MVT::i64; 7197 } 7198 7199 assert(DstTy.getSimpleVT() <= MVT::i64 && 7200 DstTy.getSimpleVT() >= MVT::i16 && 7201 "Unknown FP_TO_SINT to lower!"); 7202 7203 // These are really Legal. 7204 if (DstTy == MVT::i32 && 7205 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 7206 return std::make_pair(SDValue(), SDValue()); 7207 if (Subtarget->is64Bit() && 7208 DstTy == MVT::i64 && 7209 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 7210 return std::make_pair(SDValue(), SDValue()); 7211 7212 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 7213 // stack slot. 7214 MachineFunction &MF = DAG.getMachineFunction(); 7215 unsigned MemSize = DstTy.getSizeInBits()/8; 7216 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 7217 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7218 7219 7220 7221 unsigned Opc; 7222 switch (DstTy.getSimpleVT().SimpleTy) { 7223 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 7224 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 7225 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 7226 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 7227 } 7228 7229 SDValue Chain = DAG.getEntryNode(); 7230 SDValue Value = Op.getOperand(0); 7231 EVT TheVT = Op.getOperand(0).getValueType(); 7232 if (isScalarFPTypeInSSEReg(TheVT)) { 7233 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 7234 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 7235 MachinePointerInfo::getFixedStack(SSFI), 7236 false, false, 0); 7237 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 7238 SDValue Ops[] = { 7239 Chain, StackSlot, DAG.getValueType(TheVT) 7240 }; 7241 7242 MachineMemOperand *MMO = 7243 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7244 MachineMemOperand::MOLoad, MemSize, MemSize); 7245 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, 7246 DstTy, MMO); 7247 Chain = Value.getValue(1); 7248 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 7249 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7250 } 7251 7252 MachineMemOperand *MMO = 7253 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7254 MachineMemOperand::MOStore, MemSize, MemSize); 7255 7256 // Build the FP_TO_INT*_IN_MEM 7257 SDValue Ops[] = { Chain, Value, StackSlot }; 7258 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, 
DAG.getVTList(MVT::Other), 7259 Ops, 3, DstTy, MMO); 7260 7261 return std::make_pair(FIST, StackSlot); 7262} 7263 7264SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 7265 SelectionDAG &DAG) const { 7266 if (Op.getValueType().isVector()) 7267 return SDValue(); 7268 7269 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 7270 SDValue FIST = Vals.first, StackSlot = Vals.second; 7271 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 7272 if (FIST.getNode() == 0) return Op; 7273 7274 // Load the result. 7275 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 7276 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 7277} 7278 7279SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 7280 SelectionDAG &DAG) const { 7281 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 7282 SDValue FIST = Vals.first, StackSlot = Vals.second; 7283 assert(FIST.getNode() && "Unexpected failure"); 7284 7285 // Load the result. 7286 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 7287 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 7288} 7289 7290SDValue X86TargetLowering::LowerFABS(SDValue Op, 7291 SelectionDAG &DAG) const { 7292 LLVMContext *Context = DAG.getContext(); 7293 DebugLoc dl = Op.getDebugLoc(); 7294 EVT VT = Op.getValueType(); 7295 EVT EltVT = VT; 7296 if (VT.isVector()) 7297 EltVT = VT.getVectorElementType(); 7298 std::vector<Constant*> CV; 7299 if (EltVT == MVT::f64) { 7300 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 7301 CV.push_back(C); 7302 CV.push_back(C); 7303 } else { 7304 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 7305 CV.push_back(C); 7306 CV.push_back(C); 7307 CV.push_back(C); 7308 CV.push_back(C); 7309 } 7310 Constant *C = ConstantVector::get(CV); 7311 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7312 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7313 MachinePointerInfo::getConstantPool(), 7314 false, false, 16); 7315 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 7316} 7317 7318SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 7319 LLVMContext *Context = DAG.getContext(); 7320 DebugLoc dl = Op.getDebugLoc(); 7321 EVT VT = Op.getValueType(); 7322 EVT EltVT = VT; 7323 if (VT.isVector()) 7324 EltVT = VT.getVectorElementType(); 7325 std::vector<Constant*> CV; 7326 if (EltVT == MVT::f64) { 7327 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 7328 CV.push_back(C); 7329 CV.push_back(C); 7330 } else { 7331 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 7332 CV.push_back(C); 7333 CV.push_back(C); 7334 CV.push_back(C); 7335 CV.push_back(C); 7336 } 7337 Constant *C = ConstantVector::get(CV); 7338 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7339 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7340 MachinePointerInfo::getConstantPool(), 7341 false, false, 16); 7342 if (VT.isVector()) { 7343 return DAG.getNode(ISD::BITCAST, dl, VT, 7344 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 7345 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7346 Op.getOperand(0)), 7347 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); 7348 } else { 7349 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 7350 } 7351} 7352 7353SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 7354 LLVMContext *Context = DAG.getContext(); 7355 SDValue Op0 = Op.getOperand(0); 7356 SDValue Op1 = Op.getOperand(1); 
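  // (Plan, roughly: copysign(Op0, Op1) = (Op0 & ~signmask) | (Op1 & signmask),
  // built below out of constant-pool masks and X86ISD::FAND / X86ISD::FOR.)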
7357 DebugLoc dl = Op.getDebugLoc(); 7358 EVT VT = Op.getValueType(); 7359 EVT SrcVT = Op1.getValueType(); 7360 7361 // If second operand is smaller, extend it first. 7362 if (SrcVT.bitsLT(VT)) { 7363 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 7364 SrcVT = VT; 7365 } 7366 // And if it is bigger, shrink it first. 7367 if (SrcVT.bitsGT(VT)) { 7368 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 7369 SrcVT = VT; 7370 } 7371 7372 // At this point the operands and the result should have the same 7373 // type, and that won't be f80 since that is not custom lowered. 7374 7375 // First get the sign bit of second operand. 7376 std::vector<Constant*> CV; 7377 if (SrcVT == MVT::f64) { 7378 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 7379 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 7380 } else { 7381 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 7382 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7383 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7384 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7385 } 7386 Constant *C = ConstantVector::get(CV); 7387 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7388 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 7389 MachinePointerInfo::getConstantPool(), 7390 false, false, 16); 7391 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 7392 7393 // Shift sign bit right or left if the two operands have different types. 7394 if (SrcVT.bitsGT(VT)) { 7395 // Op0 is MVT::f32, Op1 is MVT::f64. 7396 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 7397 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 7398 DAG.getConstant(32, MVT::i32)); 7399 SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); 7400 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 7401 DAG.getIntPtrConstant(0)); 7402 } 7403 7404 // Clear first operand sign bit. 7405 CV.clear(); 7406 if (VT == MVT::f64) { 7407 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 7408 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 7409 } else { 7410 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 7411 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7412 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7413 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7414 } 7415 C = ConstantVector::get(CV); 7416 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7417 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7418 MachinePointerInfo::getConstantPool(), 7419 false, false, 16); 7420 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 7421 7422 // Or the value with the sign bit. 7423 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 7424} 7425 7426SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { 7427 SDValue N0 = Op.getOperand(0); 7428 DebugLoc dl = Op.getDebugLoc(); 7429 EVT VT = Op.getValueType(); 7430 7431 // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). 7432 SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, 7433 DAG.getConstant(1, VT)); 7434 return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); 7435} 7436 7437/// Emit nodes that will be selected as "test Op0,Op0", or something 7438/// equivalent. 
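/// For example, (setcc ne (add x, y), 0) can reuse the EFLAGS that an
/// EFLAGS-producing X86ISD::ADD already computes, instead of materializing
/// the sum and then issuing a separate test instruction.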
7439 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
7440                                     SelectionDAG &DAG) const {
7441   DebugLoc dl = Op.getDebugLoc();
7442
7443   // CF and OF aren't always set the way we want. Determine which
7444   // of these we need.
7445   bool NeedCF = false;
7446   bool NeedOF = false;
7447   switch (X86CC) {
7448   default: break;
7449   case X86::COND_A: case X86::COND_AE:
7450   case X86::COND_B: case X86::COND_BE:
7451     NeedCF = true;
7452     break;
7453   case X86::COND_G: case X86::COND_GE:
7454   case X86::COND_L: case X86::COND_LE:
7455   case X86::COND_O: case X86::COND_NO:
7456     NeedOF = true;
7457     break;
7458   }
7459
7460   // See if we can use the EFLAGS value from the operand instead of
7461   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
7462   // we prove that the arithmetic won't overflow, we can't use OF or CF.
7463   if (Op.getResNo() != 0 || NeedOF || NeedCF)
7464     // Emit a CMP with 0, which is the TEST pattern.
7465     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
7466                        DAG.getConstant(0, Op.getValueType()));
7467
7468   unsigned Opcode = 0;
7469   unsigned NumOperands = 0;
7470   switch (Op.getNode()->getOpcode()) {
7471   case ISD::ADD:
7472     // Due to an isel shortcoming, be conservative if this add is likely to be
7473     // selected as part of a load-modify-store instruction. When the root node
7474     // in a match is a store, isel doesn't know how to remap non-chain non-flag
7475     // uses of other nodes in the match, such as the ADD in this case. This
7476     // leads to the ADD being left around and reselected, with the result being
7477     // two adds in the output. Alas, even if none of our users are stores, that
7478     // doesn't prove we're O.K. Ergo, if we have any parents that aren't
7479     // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
7480     // climbing the DAG back to the root, and it doesn't seem to be worth the
7481     // effort.
7482     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
7483            UE = Op.getNode()->use_end(); UI != UE; ++UI)
7484       if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
7485         goto default_case;
7486
7487     if (ConstantSDNode *C =
7488         dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
7489       // An add of one will be selected as an INC.
7490       if (C->getAPIntValue() == 1) {
7491         Opcode = X86ISD::INC;
7492         NumOperands = 1;
7493         break;
7494       }
7495
7496       // An add of negative one (subtract of one) will be selected as a DEC.
7497       if (C->getAPIntValue().isAllOnesValue()) {
7498         Opcode = X86ISD::DEC;
7499         NumOperands = 1;
7500         break;
7501       }
7502     }
7503
7504     // Otherwise use a regular EFLAGS-setting add.
7505     Opcode = X86ISD::ADD;
7506     NumOperands = 2;
7507     break;
7508   case ISD::AND: {
7509     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
7510     // because a TEST instruction will be better.
7511     bool NonFlagUse = false;
7512     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
7513            UE = Op.getNode()->use_end(); UI != UE; ++UI) {
7514       SDNode *User = *UI;
7515       unsigned UOpNo = UI.getOperandNo();
7516       if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
7517         // Look past the truncate.
7518 UOpNo = User->use_begin().getOperandNo(); 7519 User = *User->use_begin(); 7520 } 7521 7522 if (User->getOpcode() != ISD::BRCOND && 7523 User->getOpcode() != ISD::SETCC && 7524 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { 7525 NonFlagUse = true; 7526 break; 7527 } 7528 } 7529 7530 if (!NonFlagUse) 7531 break; 7532 } 7533 // FALL THROUGH 7534 case ISD::SUB: 7535 case ISD::OR: 7536 case ISD::XOR: 7537 // Due to the ISEL shortcoming noted above, be conservative if this op is 7538 // likely to be selected as part of a load-modify-store instruction. 7539 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 7540 UE = Op.getNode()->use_end(); UI != UE; ++UI) 7541 if (UI->getOpcode() == ISD::STORE) 7542 goto default_case; 7543 7544 // Otherwise use a regular EFLAGS-setting instruction. 7545 switch (Op.getNode()->getOpcode()) { 7546 default: llvm_unreachable("unexpected operator!"); 7547 case ISD::SUB: Opcode = X86ISD::SUB; break; 7548 case ISD::OR: Opcode = X86ISD::OR; break; 7549 case ISD::XOR: Opcode = X86ISD::XOR; break; 7550 case ISD::AND: Opcode = X86ISD::AND; break; 7551 } 7552 7553 NumOperands = 2; 7554 break; 7555 case X86ISD::ADD: 7556 case X86ISD::SUB: 7557 case X86ISD::INC: 7558 case X86ISD::DEC: 7559 case X86ISD::OR: 7560 case X86ISD::XOR: 7561 case X86ISD::AND: 7562 return SDValue(Op.getNode(), 1); 7563 default: 7564 default_case: 7565 break; 7566 } 7567 7568 if (Opcode == 0) 7569 // Emit a CMP with 0, which is the TEST pattern. 7570 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 7571 DAG.getConstant(0, Op.getValueType())); 7572 7573 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 7574 SmallVector<SDValue, 4> Ops; 7575 for (unsigned i = 0; i != NumOperands; ++i) 7576 Ops.push_back(Op.getOperand(i)); 7577 7578 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 7579 DAG.ReplaceAllUsesWith(Op, New); 7580 return SDValue(New.getNode(), 1); 7581} 7582 7583/// Emit nodes that will be selected as "cmp Op0,Op1", or something 7584/// equivalent. 7585SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 7586 SelectionDAG &DAG) const { 7587 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 7588 if (C->getAPIntValue() == 0) 7589 return EmitTest(Op0, X86CC, DAG); 7590 7591 DebugLoc dl = Op0.getDebugLoc(); 7592 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 7593} 7594 7595/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 7596/// if it's possible. 7597SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 7598 DebugLoc dl, SelectionDAG &DAG) const { 7599 SDValue Op0 = And.getOperand(0); 7600 SDValue Op1 = And.getOperand(1); 7601 if (Op0.getOpcode() == ISD::TRUNCATE) 7602 Op0 = Op0.getOperand(0); 7603 if (Op1.getOpcode() == ISD::TRUNCATE) 7604 Op1 = Op1.getOperand(0); 7605 7606 SDValue LHS, RHS; 7607 if (Op1.getOpcode() == ISD::SHL) 7608 std::swap(Op0, Op1); 7609 if (Op0.getOpcode() == ISD::SHL) { 7610 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 7611 if (And00C->getZExtValue() == 1) { 7612 // If we looked past a truncate, check that it's only truncating away 7613 // known zeros. 
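          // (E.g. if the 'and' is i32 but the shl feeding it was i64, the
          // upper 32 bits of the i64 must be known zero; otherwise the BT on
          // the wider value could test a bit the truncate discarded.)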
7614 unsigned BitWidth = Op0.getValueSizeInBits(); 7615 unsigned AndBitWidth = And.getValueSizeInBits(); 7616 if (BitWidth > AndBitWidth) { 7617 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 7618 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 7619 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 7620 return SDValue(); 7621 } 7622 LHS = Op1; 7623 RHS = Op0.getOperand(1); 7624 } 7625 } else if (Op1.getOpcode() == ISD::Constant) { 7626 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 7627 SDValue AndLHS = Op0; 7628 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 7629 LHS = AndLHS.getOperand(0); 7630 RHS = AndLHS.getOperand(1); 7631 } 7632 } 7633 7634 if (LHS.getNode()) { 7635 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 7636 // instruction. Since the shift amount is in-range-or-undefined, we know 7637 // that doing a bittest on the i32 value is ok. We extend to i32 because 7638 // the encoding for the i16 version is larger than the i32 version. 7639 // Also promote i16 to i32 for performance / code size reason. 7640 if (LHS.getValueType() == MVT::i8 || 7641 LHS.getValueType() == MVT::i16) 7642 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 7643 7644 // If the operand types disagree, extend the shift amount to match. Since 7645 // BT ignores high bits (like shifts) we can use anyextend. 7646 if (LHS.getValueType() != RHS.getValueType()) 7647 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 7648 7649 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 7650 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 7651 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7652 DAG.getConstant(Cond, MVT::i8), BT); 7653 } 7654 7655 return SDValue(); 7656} 7657 7658SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 7659 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 7660 SDValue Op0 = Op.getOperand(0); 7661 SDValue Op1 = Op.getOperand(1); 7662 DebugLoc dl = Op.getDebugLoc(); 7663 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 7664 7665 // Optimize to BT if possible. 7666 // Lower (X & (1 << N)) == 0 to BT(X, N). 7667 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 7668 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 7669 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 7670 Op1.getOpcode() == ISD::Constant && 7671 cast<ConstantSDNode>(Op1)->isNullValue() && 7672 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7673 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 7674 if (NewSetCC.getNode()) 7675 return NewSetCC; 7676 } 7677 7678 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 7679 // these. 7680 if (Op1.getOpcode() == ISD::Constant && 7681 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 7682 cast<ConstantSDNode>(Op1)->isNullValue()) && 7683 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7684 7685 // If the input is a setcc, then reuse the input setcc or use a new one with 7686 // the inverted condition. 
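  // e.g. (setcc eq (x86setcc COND_E, flags), 1) can return the inner
  // x86setcc unchanged, while (setcc eq (x86setcc COND_E, flags), 0) becomes
  // an x86setcc with COND_NE on the same flags, so no new compare of the i8
  // result is needed.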
7687 if (Op0.getOpcode() == X86ISD::SETCC) { 7688 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 7689 bool Invert = (CC == ISD::SETNE) ^ 7690 cast<ConstantSDNode>(Op1)->isNullValue(); 7691 if (!Invert) return Op0; 7692 7693 CCode = X86::GetOppositeBranchCondition(CCode); 7694 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7695 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 7696 } 7697 } 7698 7699 bool isFP = Op1.getValueType().isFloatingPoint(); 7700 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 7701 if (X86CC == X86::COND_INVALID) 7702 return SDValue(); 7703 7704 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); 7705 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7706 DAG.getConstant(X86CC, MVT::i8), EFLAGS); 7707} 7708 7709SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 7710 SDValue Cond; 7711 SDValue Op0 = Op.getOperand(0); 7712 SDValue Op1 = Op.getOperand(1); 7713 SDValue CC = Op.getOperand(2); 7714 EVT VT = Op.getValueType(); 7715 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 7716 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 7717 DebugLoc dl = Op.getDebugLoc(); 7718 7719 if (isFP) { 7720 unsigned SSECC = 8; 7721 EVT VT0 = Op0.getValueType(); 7722 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 7723 unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD; 7724 bool Swap = false; 7725 7726 switch (SetCCOpcode) { 7727 default: break; 7728 case ISD::SETOEQ: 7729 case ISD::SETEQ: SSECC = 0; break; 7730 case ISD::SETOGT: 7731 case ISD::SETGT: Swap = true; // Fallthrough 7732 case ISD::SETLT: 7733 case ISD::SETOLT: SSECC = 1; break; 7734 case ISD::SETOGE: 7735 case ISD::SETGE: Swap = true; // Fallthrough 7736 case ISD::SETLE: 7737 case ISD::SETOLE: SSECC = 2; break; 7738 case ISD::SETUO: SSECC = 3; break; 7739 case ISD::SETUNE: 7740 case ISD::SETNE: SSECC = 4; break; 7741 case ISD::SETULE: Swap = true; 7742 case ISD::SETUGE: SSECC = 5; break; 7743 case ISD::SETULT: Swap = true; 7744 case ISD::SETUGT: SSECC = 6; break; 7745 case ISD::SETO: SSECC = 7; break; 7746 } 7747 if (Swap) 7748 std::swap(Op0, Op1); 7749 7750 // In the two special cases we can't handle, emit two comparisons. 7751 if (SSECC == 8) { 7752 if (SetCCOpcode == ISD::SETUEQ) { 7753 SDValue UNORD, EQ; 7754 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 7755 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 7756 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 7757 } 7758 else if (SetCCOpcode == ISD::SETONE) { 7759 SDValue ORD, NEQ; 7760 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 7761 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 7762 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 7763 } 7764 llvm_unreachable("Illegal FP comparison"); 7765 } 7766 // Handle all other FP comparisons here. 7767 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 7768 } 7769 7770 // We are handling one of the integer comparisons here. Since SSE only has 7771 // GT and EQ comparisons for integer, swapping operands and multiple 7772 // operations may be required for some comparisons. 
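  // e.g. for v4i32: SETGT is PCMPGTD directly; SETLT x, y swaps to
  // PCMPGTD y, x; SETLE x, y is ~(PCMPGTD x, y); and the unsigned forms
  // first XOR both inputs with the sign-bit vector so that the signed
  // PCMPGTD computes the unsigned ordering.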
  unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
  bool Swap = false, Invert = false, FlipSigns = false;

  switch (VT.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
  case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
  case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
  case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETNE:  Invert = true; // Fallthrough
  case ISD::SETEQ:  Opc = EQOpc; break;
  case ISD::SETLT:  Swap = true; // Fallthrough
  case ISD::SETGT:  Opc = GTOpc; break;
  case ISD::SETGE:  Swap = true; // Fallthrough
  case ISD::SETLE:  Opc = GTOpc; Invert = true; break;
  case ISD::SETULT: Swap = true; // Fallthrough
  case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
  case ISD::SETUGE: Swap = true; // Fallthrough
  case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    EVT EltVT = VT.getVectorElementType();
    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
                                      EltVT);
    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
                                  SignBits.size());
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  return Result;
}

// isX86LogicalCmp - Return true if opcode is an X86 logical comparison.
7823static bool isX86LogicalCmp(SDValue Op) { 7824 unsigned Opc = Op.getNode()->getOpcode(); 7825 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 7826 return true; 7827 if (Op.getResNo() == 1 && 7828 (Opc == X86ISD::ADD || 7829 Opc == X86ISD::SUB || 7830 Opc == X86ISD::ADC || 7831 Opc == X86ISD::SBB || 7832 Opc == X86ISD::SMUL || 7833 Opc == X86ISD::UMUL || 7834 Opc == X86ISD::INC || 7835 Opc == X86ISD::DEC || 7836 Opc == X86ISD::OR || 7837 Opc == X86ISD::XOR || 7838 Opc == X86ISD::AND)) 7839 return true; 7840 7841 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 7842 return true; 7843 7844 return false; 7845} 7846 7847static bool isZero(SDValue V) { 7848 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 7849 return C && C->isNullValue(); 7850} 7851 7852static bool isAllOnes(SDValue V) { 7853 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 7854 return C && C->isAllOnesValue(); 7855} 7856 7857SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 7858 bool addTest = true; 7859 SDValue Cond = Op.getOperand(0); 7860 SDValue Op1 = Op.getOperand(1); 7861 SDValue Op2 = Op.getOperand(2); 7862 DebugLoc DL = Op.getDebugLoc(); 7863 SDValue CC; 7864 7865 if (Cond.getOpcode() == ISD::SETCC) { 7866 SDValue NewCond = LowerSETCC(Cond, DAG); 7867 if (NewCond.getNode()) 7868 Cond = NewCond; 7869 } 7870 7871 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y 7872 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y 7873 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y 7874 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y 7875 if (Cond.getOpcode() == X86ISD::SETCC && 7876 Cond.getOperand(1).getOpcode() == X86ISD::CMP && 7877 isZero(Cond.getOperand(1).getOperand(1))) { 7878 SDValue Cmp = Cond.getOperand(1); 7879 7880 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); 7881 7882 if ((isAllOnes(Op1) || isAllOnes(Op2)) && 7883 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { 7884 SDValue Y = isAllOnes(Op2) ? Op1 : Op2; 7885 7886 SDValue CmpOp0 = Cmp.getOperand(0); 7887 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, 7888 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 7889 7890 SDValue Res = // Res = 0 or -1. 7891 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 7892 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 7893 7894 if (isAllOnes(Op1) != (CondCode == X86::COND_E)) 7895 Res = DAG.getNOT(DL, Res, Res.getValueType()); 7896 7897 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 7898 if (N2C == 0 || !N2C->isNullValue()) 7899 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); 7900 return Res; 7901 } 7902 } 7903 7904 // Look past (and (setcc_carry (cmp ...)), 1). 7905 if (Cond.getOpcode() == ISD::AND && 7906 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 7907 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 7908 if (C && C->getAPIntValue() == 1) 7909 Cond = Cond.getOperand(0); 7910 } 7911 7912 // If condition flag is set by a X86ISD::CMP, then use it as the condition 7913 // setting operand in place of the X86ISD::SETCC. 7914 if (Cond.getOpcode() == X86ISD::SETCC || 7915 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 7916 CC = Cond.getOperand(0); 7917 7918 SDValue Cmp = Cond.getOperand(1); 7919 unsigned Opc = Cmp.getOpcode(); 7920 EVT VT = Op.getValueType(); 7921 7922 bool IllegalFPCMov = false; 7923 if (VT.isFloatingPoint() && !VT.isVector() && 7924 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      addTest = false;
    }
  }

  if (addTest) {
    // Look past the truncate.
    if (Cond.getOpcode() == ISD::TRUNCATE)
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
      if (NewSetCC.getNode()) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DAG);
  }

  // a < b ? -1 : 0 -> RES = ~setcc_carry
  // a < b ? 0 : -1 -> RES = setcc_carry
  // a >= b ? -1 : 0 -> RES = setcc_carry
  // a >= b ? 0 : -1 -> RES = ~setcc_carry
  if (Cond.getOpcode() == X86ISD::CMP) {
    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
        (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                DAG.getConstant(X86::COND_B, MVT::i8), Cond);
      if (isAllOnes(Op1) != (CondCode == X86::COND_B))
        return DAG.getNOT(DL, Res, Res.getValueType());
      return Res;
    }
  }

  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
  SDValue Ops[] = { Op2, Op1, CC, Cond };
  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
}

// isAndOrOfSetCCs - Return true if node is an ISD::AND or ISD::OR of two
// X86ISD::SETCC nodes, each of which has no other use apart from the
// AND / OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
  Opc = Op.getOpcode();
  if (Opc != ISD::OR && Opc != ISD::AND)
    return false;
  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(0).hasOneUse() &&
          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(1).hasOneUse());
}

// isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC
// and 1, where the SETCC node has a single use.
static bool isXor1OfSetCC(SDValue Op) {
  if (Op.getOpcode() != ISD::XOR)
    return false;
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (N1C && N1C->getAPIntValue() == 1) {
    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
           Op.getOperand(0).hasOneUse();
  }
  return false;
}

SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  bool addTest = true;
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);
  DebugLoc dl = Op.getDebugLoc();
  SDValue CC;

  if (Cond.getOpcode() == ISD::SETCC) {
    SDValue NewCond = LowerSETCC(Cond, DAG);
    if (NewCond.getNode())
      Cond = NewCond;
  }
#if 0
  // FIXME: LowerXALUO doesn't handle these!!
8021 else if (Cond.getOpcode() == X86ISD::ADD || 8022 Cond.getOpcode() == X86ISD::SUB || 8023 Cond.getOpcode() == X86ISD::SMUL || 8024 Cond.getOpcode() == X86ISD::UMUL) 8025 Cond = LowerXALUO(Cond, DAG); 8026#endif 8027 8028 // Look pass (and (setcc_carry (cmp ...)), 1). 8029 if (Cond.getOpcode() == ISD::AND && 8030 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 8031 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 8032 if (C && C->getAPIntValue() == 1) 8033 Cond = Cond.getOperand(0); 8034 } 8035 8036 // If condition flag is set by a X86ISD::CMP, then use it as the condition 8037 // setting operand in place of the X86ISD::SETCC. 8038 if (Cond.getOpcode() == X86ISD::SETCC || 8039 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 8040 CC = Cond.getOperand(0); 8041 8042 SDValue Cmp = Cond.getOperand(1); 8043 unsigned Opc = Cmp.getOpcode(); 8044 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 8045 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 8046 Cond = Cmp; 8047 addTest = false; 8048 } else { 8049 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 8050 default: break; 8051 case X86::COND_O: 8052 case X86::COND_B: 8053 // These can only come from an arithmetic instruction with overflow, 8054 // e.g. SADDO, UADDO. 8055 Cond = Cond.getNode()->getOperand(1); 8056 addTest = false; 8057 break; 8058 } 8059 } 8060 } else { 8061 unsigned CondOpc; 8062 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 8063 SDValue Cmp = Cond.getOperand(0).getOperand(1); 8064 if (CondOpc == ISD::OR) { 8065 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 8066 // two branches instead of an explicit OR instruction with a 8067 // separate test. 8068 if (Cmp == Cond.getOperand(1).getOperand(1) && 8069 isX86LogicalCmp(Cmp)) { 8070 CC = Cond.getOperand(0).getOperand(0); 8071 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8072 Chain, Dest, CC, Cmp); 8073 CC = Cond.getOperand(1).getOperand(0); 8074 Cond = Cmp; 8075 addTest = false; 8076 } 8077 } else { // ISD::AND 8078 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 8079 // two branches instead of an explicit AND instruction with a 8080 // separate test. However, we only do this if this block doesn't 8081 // have a fall-through edge, because this requires an explicit 8082 // jmp when the condition is false. 8083 if (Cmp == Cond.getOperand(1).getOperand(1) && 8084 isX86LogicalCmp(Cmp) && 8085 Op.getNode()->hasOneUse()) { 8086 X86::CondCode CCode = 8087 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 8088 CCode = X86::GetOppositeBranchCondition(CCode); 8089 CC = DAG.getConstant(CCode, MVT::i8); 8090 SDNode *User = *Op.getNode()->use_begin(); 8091 // Look for an unconditional branch following this conditional branch. 8092 // We need this because we need to reverse the successors in order 8093 // to implement FCMP_OEQ. 
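          // e.g. FCMP_OEQ becomes (and (setcc COND_E) (setcc COND_NP));
          // instead of materializing the AND we emit "jne false; jp false"
          // and point the trailing unconditional branch at the true block,
          // which is why the two successors get swapped below.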
          if (User->getOpcode() == ISD::BR) {
            SDValue FalseBB = User->getOperand(1);
            SDNode *NewBR =
              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
            assert(NewBR == User);
            (void)NewBR;
            Dest = FalseBB;

            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                                Chain, Dest, CC, Cmp);
            X86::CondCode CCode =
              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
            CCode = X86::GetOppositeBranchCondition(CCode);
            CC = DAG.getConstant(CCode, MVT::i8);
            Cond = Cmp;
            addTest = false;
          }
        }
      }
    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
      // Recognize the pattern xorb (setcc), 1; the xor inverts the condition.
      // It should be transformed by the DAG combiner except when the
      // condition is set by an arithmetic-with-overflow node.
      X86::CondCode CCode =
        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
      CCode = X86::GetOppositeBranchCondition(CCode);
      CC = DAG.getConstant(CCode, MVT::i8);
      Cond = Cond.getOperand(0).getOperand(1);
      addTest = false;
    }
  }

  if (addTest) {
    // Look past the truncate.
    if (Cond.getOpcode() == ISD::TRUNCATE)
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
      if (NewSetCC.getNode()) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DAG);
  }
  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                     Chain, Dest, CC, Cond);
}


// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// A call to _alloca is needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// the correct sequence.
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                           SelectionDAG &DAG) const {
  assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) &&
         "This should be used only on Windows targets");
  assert(!Subtarget->isTargetEnvMacho());
  DebugLoc dl = Op.getDebugLoc();

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);
  // FIXME: Ensure alignment here

  SDValue Flag;

  EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
  unsigned Reg = (Subtarget->is64Bit() ?
X86::RAX : X86::EAX); 8174 8175 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); 8176 Flag = Chain.getValue(1); 8177 8178 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 8179 8180 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 8181 Flag = Chain.getValue(1); 8182 8183 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 8184 8185 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 8186 return DAG.getMergeValues(Ops1, 2, dl); 8187} 8188 8189SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 8190 MachineFunction &MF = DAG.getMachineFunction(); 8191 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 8192 8193 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 8194 DebugLoc DL = Op.getDebugLoc(); 8195 8196 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 8197 // vastart just stores the address of the VarArgsFrameIndex slot into the 8198 // memory location argument. 8199 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 8200 getPointerTy()); 8201 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 8202 MachinePointerInfo(SV), false, false, 0); 8203 } 8204 8205 // __va_list_tag: 8206 // gp_offset (0 - 6 * 8) 8207 // fp_offset (48 - 48 + 8 * 16) 8208 // overflow_arg_area (point to parameters coming in memory). 8209 // reg_save_area 8210 SmallVector<SDValue, 8> MemOps; 8211 SDValue FIN = Op.getOperand(1); 8212 // Store gp_offset 8213 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 8214 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 8215 MVT::i32), 8216 FIN, MachinePointerInfo(SV), false, false, 0); 8217 MemOps.push_back(Store); 8218 8219 // Store fp_offset 8220 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8221 FIN, DAG.getIntPtrConstant(4)); 8222 Store = DAG.getStore(Op.getOperand(0), DL, 8223 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 8224 MVT::i32), 8225 FIN, MachinePointerInfo(SV, 4), false, false, 0); 8226 MemOps.push_back(Store); 8227 8228 // Store ptr to overflow_arg_area 8229 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8230 FIN, DAG.getIntPtrConstant(4)); 8231 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 8232 getPointerTy()); 8233 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 8234 MachinePointerInfo(SV, 8), 8235 false, false, 0); 8236 MemOps.push_back(Store); 8237 8238 // Store ptr to reg_save_area. 
8239 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8240 FIN, DAG.getIntPtrConstant(8)); 8241 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 8242 getPointerTy()); 8243 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 8244 MachinePointerInfo(SV, 16), false, false, 0); 8245 MemOps.push_back(Store); 8246 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 8247 &MemOps[0], MemOps.size()); 8248} 8249 8250SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 8251 assert(Subtarget->is64Bit() && 8252 "LowerVAARG only handles 64-bit va_arg!"); 8253 assert((Subtarget->isTargetLinux() || 8254 Subtarget->isTargetDarwin()) && 8255 "Unhandled target in LowerVAARG"); 8256 assert(Op.getNode()->getNumOperands() == 4); 8257 SDValue Chain = Op.getOperand(0); 8258 SDValue SrcPtr = Op.getOperand(1); 8259 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 8260 unsigned Align = Op.getConstantOperandVal(3); 8261 DebugLoc dl = Op.getDebugLoc(); 8262 8263 EVT ArgVT = Op.getNode()->getValueType(0); 8264 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 8265 uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); 8266 uint8_t ArgMode; 8267 8268 // Decide which area this value should be read from. 8269 // TODO: Implement the AMD64 ABI in its entirety. This simple 8270 // selection mechanism works only for the basic types. 8271 if (ArgVT == MVT::f80) { 8272 llvm_unreachable("va_arg for f80 not yet implemented"); 8273 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 8274 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 8275 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 8276 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 8277 } else { 8278 llvm_unreachable("Unhandled argument type in LowerVAARG"); 8279 } 8280 8281 if (ArgMode == 2) { 8282 // Sanity Check: Make sure using fp_offset makes sense. 8283 assert(!UseSoftFloat && 8284 !(DAG.getMachineFunction() 8285 .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && 8286 Subtarget->hasXMM()); 8287 } 8288 8289 // Insert VAARG_64 node into the DAG 8290 // VAARG_64 returns two values: Variable Argument Address, Chain 8291 SmallVector<SDValue, 11> InstOps; 8292 InstOps.push_back(Chain); 8293 InstOps.push_back(SrcPtr); 8294 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 8295 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 8296 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 8297 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 8298 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 8299 VTs, &InstOps[0], InstOps.size(), 8300 MVT::i64, 8301 MachinePointerInfo(SV), 8302 /*Align=*/0, 8303 /*Volatile=*/false, 8304 /*ReadMem=*/true, 8305 /*WriteMem=*/true); 8306 Chain = VAARG.getValue(1); 8307 8308 // Load the next argument and return it 8309 return DAG.getLoad(ArgVT, dl, 8310 Chain, 8311 VAARG, 8312 MachinePointerInfo(), 8313 false, false, 0); 8314} 8315 8316SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 8317 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
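  // Field offsets: gp_offset at 0, fp_offset at 4, overflow_arg_area at 8,
  // reg_save_area at 16; the whole struct is 24 bytes, hence the fixed-size
  // memcpy below.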
8318 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 8319 SDValue Chain = Op.getOperand(0); 8320 SDValue DstPtr = Op.getOperand(1); 8321 SDValue SrcPtr = Op.getOperand(2); 8322 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 8323 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 8324 DebugLoc DL = Op.getDebugLoc(); 8325 8326 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 8327 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 8328 false, 8329 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 8330} 8331 8332SDValue 8333X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 8334 DebugLoc dl = Op.getDebugLoc(); 8335 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 8336 switch (IntNo) { 8337 default: return SDValue(); // Don't custom lower most intrinsics. 8338 // Comparison intrinsics. 8339 case Intrinsic::x86_sse_comieq_ss: 8340 case Intrinsic::x86_sse_comilt_ss: 8341 case Intrinsic::x86_sse_comile_ss: 8342 case Intrinsic::x86_sse_comigt_ss: 8343 case Intrinsic::x86_sse_comige_ss: 8344 case Intrinsic::x86_sse_comineq_ss: 8345 case Intrinsic::x86_sse_ucomieq_ss: 8346 case Intrinsic::x86_sse_ucomilt_ss: 8347 case Intrinsic::x86_sse_ucomile_ss: 8348 case Intrinsic::x86_sse_ucomigt_ss: 8349 case Intrinsic::x86_sse_ucomige_ss: 8350 case Intrinsic::x86_sse_ucomineq_ss: 8351 case Intrinsic::x86_sse2_comieq_sd: 8352 case Intrinsic::x86_sse2_comilt_sd: 8353 case Intrinsic::x86_sse2_comile_sd: 8354 case Intrinsic::x86_sse2_comigt_sd: 8355 case Intrinsic::x86_sse2_comige_sd: 8356 case Intrinsic::x86_sse2_comineq_sd: 8357 case Intrinsic::x86_sse2_ucomieq_sd: 8358 case Intrinsic::x86_sse2_ucomilt_sd: 8359 case Intrinsic::x86_sse2_ucomile_sd: 8360 case Intrinsic::x86_sse2_ucomigt_sd: 8361 case Intrinsic::x86_sse2_ucomige_sd: 8362 case Intrinsic::x86_sse2_ucomineq_sd: { 8363 unsigned Opc = 0; 8364 ISD::CondCode CC = ISD::SETCC_INVALID; 8365 switch (IntNo) { 8366 default: break; 8367 case Intrinsic::x86_sse_comieq_ss: 8368 case Intrinsic::x86_sse2_comieq_sd: 8369 Opc = X86ISD::COMI; 8370 CC = ISD::SETEQ; 8371 break; 8372 case Intrinsic::x86_sse_comilt_ss: 8373 case Intrinsic::x86_sse2_comilt_sd: 8374 Opc = X86ISD::COMI; 8375 CC = ISD::SETLT; 8376 break; 8377 case Intrinsic::x86_sse_comile_ss: 8378 case Intrinsic::x86_sse2_comile_sd: 8379 Opc = X86ISD::COMI; 8380 CC = ISD::SETLE; 8381 break; 8382 case Intrinsic::x86_sse_comigt_ss: 8383 case Intrinsic::x86_sse2_comigt_sd: 8384 Opc = X86ISD::COMI; 8385 CC = ISD::SETGT; 8386 break; 8387 case Intrinsic::x86_sse_comige_ss: 8388 case Intrinsic::x86_sse2_comige_sd: 8389 Opc = X86ISD::COMI; 8390 CC = ISD::SETGE; 8391 break; 8392 case Intrinsic::x86_sse_comineq_ss: 8393 case Intrinsic::x86_sse2_comineq_sd: 8394 Opc = X86ISD::COMI; 8395 CC = ISD::SETNE; 8396 break; 8397 case Intrinsic::x86_sse_ucomieq_ss: 8398 case Intrinsic::x86_sse2_ucomieq_sd: 8399 Opc = X86ISD::UCOMI; 8400 CC = ISD::SETEQ; 8401 break; 8402 case Intrinsic::x86_sse_ucomilt_ss: 8403 case Intrinsic::x86_sse2_ucomilt_sd: 8404 Opc = X86ISD::UCOMI; 8405 CC = ISD::SETLT; 8406 break; 8407 case Intrinsic::x86_sse_ucomile_ss: 8408 case Intrinsic::x86_sse2_ucomile_sd: 8409 Opc = X86ISD::UCOMI; 8410 CC = ISD::SETLE; 8411 break; 8412 case Intrinsic::x86_sse_ucomigt_ss: 8413 case Intrinsic::x86_sse2_ucomigt_sd: 8414 Opc = X86ISD::UCOMI; 8415 CC = ISD::SETGT; 8416 break; 8417 case Intrinsic::x86_sse_ucomige_ss: 8418 case Intrinsic::x86_sse2_ucomige_sd: 8419 Opc = X86ISD::UCOMI; 8420 
      CC = ISD::SETGE;
      break;
    case Intrinsic::x86_sse_ucomineq_ss:
    case Intrinsic::x86_sse2_ucomineq_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETNE;
      break;
    }

    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                DAG.getConstant(X86CC, MVT::i8), Cond);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
  // ptest and testp intrinsics. The intrinsics these come from are designed
  // to return an integer value rather than an instruction, so lower them to
  // the ptest or testp pattern and a setcc for the result.
  case Intrinsic::x86_sse41_ptestz:
  case Intrinsic::x86_sse41_ptestc:
  case Intrinsic::x86_sse41_ptestnzc:
  case Intrinsic::x86_avx_ptestz_256:
  case Intrinsic::x86_avx_ptestc_256:
  case Intrinsic::x86_avx_ptestnzc_256:
  case Intrinsic::x86_avx_vtestz_ps:
  case Intrinsic::x86_avx_vtestc_ps:
  case Intrinsic::x86_avx_vtestnzc_ps:
  case Intrinsic::x86_avx_vtestz_pd:
  case Intrinsic::x86_avx_vtestc_pd:
  case Intrinsic::x86_avx_vtestnzc_pd:
  case Intrinsic::x86_avx_vtestz_ps_256:
  case Intrinsic::x86_avx_vtestc_ps_256:
  case Intrinsic::x86_avx_vtestnzc_ps_256:
  case Intrinsic::x86_avx_vtestz_pd_256:
  case Intrinsic::x86_avx_vtestc_pd_256:
  case Intrinsic::x86_avx_vtestnzc_pd_256: {
    bool IsTestPacked = false;
    unsigned X86CC = 0;
    switch (IntNo) {
    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
    case Intrinsic::x86_avx_vtestz_ps:
    case Intrinsic::x86_avx_vtestz_pd:
    case Intrinsic::x86_avx_vtestz_ps_256:
    case Intrinsic::x86_avx_vtestz_pd_256:
      IsTestPacked = true; // Fallthrough
    case Intrinsic::x86_sse41_ptestz:
    case Intrinsic::x86_avx_ptestz_256:
      // ZF = 1
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_avx_vtestc_ps:
    case Intrinsic::x86_avx_vtestc_pd:
    case Intrinsic::x86_avx_vtestc_ps_256:
    case Intrinsic::x86_avx_vtestc_pd_256:
      IsTestPacked = true; // Fallthrough
    case Intrinsic::x86_sse41_ptestc:
    case Intrinsic::x86_avx_ptestc_256:
      // CF = 1
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_avx_vtestnzc_ps:
    case Intrinsic::x86_avx_vtestnzc_pd:
    case Intrinsic::x86_avx_vtestnzc_ps_256:
    case Intrinsic::x86_avx_vtestnzc_pd_256:
      IsTestPacked = true; // Fallthrough
    case Intrinsic::x86_sse41_ptestnzc:
    case Intrinsic::x86_avx_ptestnzc_256:
      // ZF and CF = 0
      X86CC = X86::COND_A;
      break;
    }

    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }

  // Fix vector shift instructions where the last operand is a non-immediate
  // i32 value.
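  // e.g. a pslli.d whose count operand is not a compile-time constant cannot
  // use the immediate-form instruction; it is rewritten below into the
  // corresponding psll.d register form, with the i32 count placed in the low
  // element of a vector whose next element is zero, since the hardware form
  // reads a 64-bit shift count.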
8506 case Intrinsic::x86_sse2_pslli_w: 8507 case Intrinsic::x86_sse2_pslli_d: 8508 case Intrinsic::x86_sse2_pslli_q: 8509 case Intrinsic::x86_sse2_psrli_w: 8510 case Intrinsic::x86_sse2_psrli_d: 8511 case Intrinsic::x86_sse2_psrli_q: 8512 case Intrinsic::x86_sse2_psrai_w: 8513 case Intrinsic::x86_sse2_psrai_d: 8514 case Intrinsic::x86_mmx_pslli_w: 8515 case Intrinsic::x86_mmx_pslli_d: 8516 case Intrinsic::x86_mmx_pslli_q: 8517 case Intrinsic::x86_mmx_psrli_w: 8518 case Intrinsic::x86_mmx_psrli_d: 8519 case Intrinsic::x86_mmx_psrli_q: 8520 case Intrinsic::x86_mmx_psrai_w: 8521 case Intrinsic::x86_mmx_psrai_d: { 8522 SDValue ShAmt = Op.getOperand(2); 8523 if (isa<ConstantSDNode>(ShAmt)) 8524 return SDValue(); 8525 8526 unsigned NewIntNo = 0; 8527 EVT ShAmtVT = MVT::v4i32; 8528 switch (IntNo) { 8529 case Intrinsic::x86_sse2_pslli_w: 8530 NewIntNo = Intrinsic::x86_sse2_psll_w; 8531 break; 8532 case Intrinsic::x86_sse2_pslli_d: 8533 NewIntNo = Intrinsic::x86_sse2_psll_d; 8534 break; 8535 case Intrinsic::x86_sse2_pslli_q: 8536 NewIntNo = Intrinsic::x86_sse2_psll_q; 8537 break; 8538 case Intrinsic::x86_sse2_psrli_w: 8539 NewIntNo = Intrinsic::x86_sse2_psrl_w; 8540 break; 8541 case Intrinsic::x86_sse2_psrli_d: 8542 NewIntNo = Intrinsic::x86_sse2_psrl_d; 8543 break; 8544 case Intrinsic::x86_sse2_psrli_q: 8545 NewIntNo = Intrinsic::x86_sse2_psrl_q; 8546 break; 8547 case Intrinsic::x86_sse2_psrai_w: 8548 NewIntNo = Intrinsic::x86_sse2_psra_w; 8549 break; 8550 case Intrinsic::x86_sse2_psrai_d: 8551 NewIntNo = Intrinsic::x86_sse2_psra_d; 8552 break; 8553 default: { 8554 ShAmtVT = MVT::v2i32; 8555 switch (IntNo) { 8556 case Intrinsic::x86_mmx_pslli_w: 8557 NewIntNo = Intrinsic::x86_mmx_psll_w; 8558 break; 8559 case Intrinsic::x86_mmx_pslli_d: 8560 NewIntNo = Intrinsic::x86_mmx_psll_d; 8561 break; 8562 case Intrinsic::x86_mmx_pslli_q: 8563 NewIntNo = Intrinsic::x86_mmx_psll_q; 8564 break; 8565 case Intrinsic::x86_mmx_psrli_w: 8566 NewIntNo = Intrinsic::x86_mmx_psrl_w; 8567 break; 8568 case Intrinsic::x86_mmx_psrli_d: 8569 NewIntNo = Intrinsic::x86_mmx_psrl_d; 8570 break; 8571 case Intrinsic::x86_mmx_psrli_q: 8572 NewIntNo = Intrinsic::x86_mmx_psrl_q; 8573 break; 8574 case Intrinsic::x86_mmx_psrai_w: 8575 NewIntNo = Intrinsic::x86_mmx_psra_w; 8576 break; 8577 case Intrinsic::x86_mmx_psrai_d: 8578 NewIntNo = Intrinsic::x86_mmx_psra_d; 8579 break; 8580 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 8581 } 8582 break; 8583 } 8584 } 8585 8586 // The vector shift intrinsics with scalars uses 32b shift amounts but 8587 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 8588 // to be zero. 8589 SDValue ShOps[4]; 8590 ShOps[0] = ShAmt; 8591 ShOps[1] = DAG.getConstant(0, MVT::i32); 8592 if (ShAmtVT == MVT::v4i32) { 8593 ShOps[2] = DAG.getUNDEF(MVT::i32); 8594 ShOps[3] = DAG.getUNDEF(MVT::i32); 8595 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 8596 } else { 8597 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 8598// FIXME this must be lowered to get rid of the invalid type. 
8599 } 8600 8601 EVT VT = Op.getValueType(); 8602 ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); 8603 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8604 DAG.getConstant(NewIntNo, MVT::i32), 8605 Op.getOperand(1), ShAmt); 8606 } 8607 } 8608} 8609 8610SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 8611 SelectionDAG &DAG) const { 8612 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 8613 MFI->setReturnAddressIsTaken(true); 8614 8615 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 8616 DebugLoc dl = Op.getDebugLoc(); 8617 8618 if (Depth > 0) { 8619 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 8620 SDValue Offset = 8621 DAG.getConstant(TD->getPointerSize(), 8622 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 8623 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 8624 DAG.getNode(ISD::ADD, dl, getPointerTy(), 8625 FrameAddr, Offset), 8626 MachinePointerInfo(), false, false, 0); 8627 } 8628 8629 // Just load the return address. 8630 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 8631 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 8632 RetAddrFI, MachinePointerInfo(), false, false, 0); 8633} 8634 8635SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 8636 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 8637 MFI->setFrameAddressIsTaken(true); 8638 8639 EVT VT = Op.getValueType(); 8640 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 8641 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 8642 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 8643 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 8644 while (Depth--) 8645 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 8646 MachinePointerInfo(), 8647 false, false, 0); 8648 return FrameAddr; 8649} 8650 8651SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 8652 SelectionDAG &DAG) const { 8653 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 8654} 8655 8656SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 8657 MachineFunction &MF = DAG.getMachineFunction(); 8658 SDValue Chain = Op.getOperand(0); 8659 SDValue Offset = Op.getOperand(1); 8660 SDValue Handler = Op.getOperand(2); 8661 DebugLoc dl = Op.getDebugLoc(); 8662 8663 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 8664 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 8665 getPointerTy()); 8666 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 8667 8668 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 8669 DAG.getIntPtrConstant(TD->getPointerSize())); 8670 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 8671 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 8672 false, false, 0); 8673 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 8674 MF.getRegInfo().addLiveOut(StoreAddrReg); 8675 8676 return DAG.getNode(X86ISD::EH_RETURN, dl, 8677 MVT::Other, 8678 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 8679} 8680 8681SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 8682 SelectionDAG &DAG) const { 8683 SDValue Root = Op.getOperand(0); 8684 SDValue Trmp = Op.getOperand(1); // trampoline 8685 SDValue FPtr = Op.getOperand(2); // nested function 8686 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 8687 DebugLoc dl = Op.getDebugLoc(); 8688 8689 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 8690 8691 if (Subtarget->is64Bit()) { 8692 SDValue OutChains[6]; 8693 8694 // Large code-model. 8695 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 8696 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 8697 8698 const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10); 8699 const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11); 8700 8701 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 8702 8703 // Load the pointer to the nested function into R11. 8704 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 8705 SDValue Addr = Trmp; 8706 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8707 Addr, MachinePointerInfo(TrmpAddr), 8708 false, false, 0); 8709 8710 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8711 DAG.getConstant(2, MVT::i64)); 8712 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 8713 MachinePointerInfo(TrmpAddr, 2), 8714 false, false, 2); 8715 8716 // Load the 'nest' parameter value into R10. 8717 // R10 is specified in X86CallingConv.td 8718 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 8719 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8720 DAG.getConstant(10, MVT::i64)); 8721 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8722 Addr, MachinePointerInfo(TrmpAddr, 10), 8723 false, false, 0); 8724 8725 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8726 DAG.getConstant(12, MVT::i64)); 8727 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 8728 MachinePointerInfo(TrmpAddr, 12), 8729 false, false, 2); 8730 8731 // Jump to the nested function. 8732 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
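    // Byte layout being assembled (little-endian stores):
    //   0:  49 BB <FPtr:imm64>   movabsq $FPtr, %r11
    //   10: 49 BA <Nest:imm64>   movabsq $Nest, %r10
    //   20: 49 FF E3             jmpq   *%r11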
8733 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8734 DAG.getConstant(20, MVT::i64)); 8735 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8736 Addr, MachinePointerInfo(TrmpAddr, 20), 8737 false, false, 0); 8738 8739 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 8740 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8741 DAG.getConstant(22, MVT::i64)); 8742 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 8743 MachinePointerInfo(TrmpAddr, 22), 8744 false, false, 0); 8745 8746 SDValue Ops[] = 8747 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 8748 return DAG.getMergeValues(Ops, 2, dl); 8749 } else { 8750 const Function *Func = 8751 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 8752 CallingConv::ID CC = Func->getCallingConv(); 8753 unsigned NestReg; 8754 8755 switch (CC) { 8756 default: 8757 llvm_unreachable("Unsupported calling convention"); 8758 case CallingConv::C: 8759 case CallingConv::X86_StdCall: { 8760 // Pass 'nest' parameter in ECX. 8761 // Must be kept in sync with X86CallingConv.td 8762 NestReg = X86::ECX; 8763 8764 // Check that ECX wasn't needed by an 'inreg' parameter. 8765 FunctionType *FTy = Func->getFunctionType(); 8766 const AttrListPtr &Attrs = Func->getAttributes(); 8767 8768 if (!Attrs.isEmpty() && !Func->isVarArg()) { 8769 unsigned InRegCount = 0; 8770 unsigned Idx = 1; 8771 8772 for (FunctionType::param_iterator I = FTy->param_begin(), 8773 E = FTy->param_end(); I != E; ++I, ++Idx) 8774 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 8775 // FIXME: should only count parameters that are lowered to integers. 8776 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 8777 8778 if (InRegCount > 2) { 8779 report_fatal_error("Nest register in use - reduce number of inreg" 8780 " parameters!"); 8781 } 8782 } 8783 break; 8784 } 8785 case CallingConv::X86_FastCall: 8786 case CallingConv::X86_ThisCall: 8787 case CallingConv::Fast: 8788 // Pass 'nest' parameter in EAX. 8789 // Must be kept in sync with X86CallingConv.td 8790 NestReg = X86::EAX; 8791 break; 8792 } 8793 8794 SDValue OutChains[4]; 8795 SDValue Addr, Disp; 8796 8797 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8798 DAG.getConstant(10, MVT::i32)); 8799 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 8800 8801 // This is storing the opcode for MOV32ri. 8802 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 8803 const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg); 8804 OutChains[0] = DAG.getStore(Root, dl, 8805 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 8806 Trmp, MachinePointerInfo(TrmpAddr), 8807 false, false, 0); 8808 8809 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8810 DAG.getConstant(1, MVT::i32)); 8811 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 8812 MachinePointerInfo(TrmpAddr, 1), 8813 false, false, 1); 8814 8815 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
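    // The 32-bit trampoline assembled here is 10 bytes:
    //   0: B8+reg <Nest:imm32>   movl $Nest, %eax|%ecx
    //   5: E9     <rel32>        jmp  FPtr
    // with the displacement taken relative to the end of the jmp, Trmp+10.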
8816 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8817 DAG.getConstant(5, MVT::i32)); 8818 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 8819 MachinePointerInfo(TrmpAddr, 5), 8820 false, false, 1); 8821 8822 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8823 DAG.getConstant(6, MVT::i32)); 8824 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 8825 MachinePointerInfo(TrmpAddr, 6), 8826 false, false, 1); 8827 8828 SDValue Ops[] = 8829 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 8830 return DAG.getMergeValues(Ops, 2, dl); 8831 } 8832} 8833 8834SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 8835 SelectionDAG &DAG) const { 8836 /* 8837 The rounding mode is in bits 11:10 of FPSR, and has the following 8838 settings: 8839 00 Round to nearest 8840 01 Round to -inf 8841 10 Round to +inf 8842 11 Round to 0 8843 8844 FLT_ROUNDS, on the other hand, expects the following: 8845 -1 Undefined 8846 0 Round to 0 8847 1 Round to nearest 8848 2 Round to +inf 8849 3 Round to -inf 8850 8851 To perform the conversion, we do: 8852 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 8853 */ 8854 8855 MachineFunction &MF = DAG.getMachineFunction(); 8856 const TargetMachine &TM = MF.getTarget(); 8857 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 8858 unsigned StackAlignment = TFI.getStackAlignment(); 8859 EVT VT = Op.getValueType(); 8860 DebugLoc DL = Op.getDebugLoc(); 8861 8862 // Save FP Control Word to stack slot 8863 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 8864 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 8865 8866 8867 MachineMemOperand *MMO = 8868 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8869 MachineMemOperand::MOStore, 2, 2); 8870 8871 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 8872 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 8873 DAG.getVTList(MVT::Other), 8874 Ops, 2, MVT::i16, MMO); 8875 8876 // Load FP Control Word from stack slot 8877 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 8878 MachinePointerInfo(), false, false, 0); 8879 8880 // Transform as necessary 8881 SDValue CWD1 = 8882 DAG.getNode(ISD::SRL, DL, MVT::i16, 8883 DAG.getNode(ISD::AND, DL, MVT::i16, 8884 CWD, DAG.getConstant(0x800, MVT::i16)), 8885 DAG.getConstant(11, MVT::i8)); 8886 SDValue CWD2 = 8887 DAG.getNode(ISD::SRL, DL, MVT::i16, 8888 DAG.getNode(ISD::AND, DL, MVT::i16, 8889 CWD, DAG.getConstant(0x400, MVT::i16)), 8890 DAG.getConstant(9, MVT::i8)); 8891 8892 SDValue RetVal = 8893 DAG.getNode(ISD::AND, DL, MVT::i16, 8894 DAG.getNode(ISD::ADD, DL, MVT::i16, 8895 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 8896 DAG.getConstant(1, MVT::i16)), 8897 DAG.getConstant(3, MVT::i16)); 8898 8899 8900 return DAG.getNode((VT.getSizeInBits() < 16 ? 8901 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 8902} 8903 8904SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 8905 EVT VT = Op.getValueType(); 8906 EVT OpVT = VT; 8907 unsigned NumBits = VT.getSizeInBits(); 8908 DebugLoc dl = Op.getDebugLoc(); 8909 8910 Op = Op.getOperand(0); 8911 if (VT == MVT::i8) { 8912 // Zero extend to i32 since there is not an i8 bsr. 8913 OpVT = MVT::i32; 8914 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 8915 } 8916 8917 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 8918 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 8919 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 8920 8921 // If src is zero (i.e. 
bsr sets ZF), returns NumBits. 8922 SDValue Ops[] = { 8923 Op, 8924 DAG.getConstant(NumBits+NumBits-1, OpVT), 8925 DAG.getConstant(X86::COND_E, MVT::i8), 8926 Op.getValue(1) 8927 }; 8928 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 8929 8930 // Finally xor with NumBits-1. 8931 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 8932 8933 if (VT == MVT::i8) 8934 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 8935 return Op; 8936} 8937 8938SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 8939 EVT VT = Op.getValueType(); 8940 EVT OpVT = VT; 8941 unsigned NumBits = VT.getSizeInBits(); 8942 DebugLoc dl = Op.getDebugLoc(); 8943 8944 Op = Op.getOperand(0); 8945 if (VT == MVT::i8) { 8946 OpVT = MVT::i32; 8947 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 8948 } 8949 8950 // Issue a bsf (scan bits forward) which also sets EFLAGS. 8951 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 8952 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 8953 8954 // If src is zero (i.e. bsf sets ZF), returns NumBits. 8955 SDValue Ops[] = { 8956 Op, 8957 DAG.getConstant(NumBits, OpVT), 8958 DAG.getConstant(X86::COND_E, MVT::i8), 8959 Op.getValue(1) 8960 }; 8961 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 8962 8963 if (VT == MVT::i8) 8964 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 8965 return Op; 8966} 8967 8968SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { 8969 EVT VT = Op.getValueType(); 8970 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 8971 DebugLoc dl = Op.getDebugLoc(); 8972 8973 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 8974 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 8975 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 8976 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 8977 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 8978 // 8979 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 8980 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 8981 // return AloBlo + AloBhi + AhiBlo; 8982 8983 SDValue A = Op.getOperand(0); 8984 SDValue B = Op.getOperand(1); 8985 8986 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8987 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 8988 A, DAG.getConstant(32, MVT::i32)); 8989 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8990 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 8991 B, DAG.getConstant(32, MVT::i32)); 8992 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8993 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 8994 A, B); 8995 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8996 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 8997 A, Bhi); 8998 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8999 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 9000 Ahi, B); 9001 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9002 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9003 AloBhi, DAG.getConstant(32, MVT::i32)); 9004 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9005 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9006 AhiBlo, DAG.getConstant(32, MVT::i32)); 9007 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 9008 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 9009 return Res; 9010} 9011 9012SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 9013 9014 EVT VT = Op.getValueType(); 9015 DebugLoc dl = 
Op.getDebugLoc(); 9016 SDValue R = Op.getOperand(0); 9017 SDValue Amt = Op.getOperand(1); 9018 9019 LLVMContext *Context = DAG.getContext(); 9020 9021 // Must have SSE2. 9022 if (!Subtarget->hasSSE2()) return SDValue(); 9023 9024 // Optimize shl/srl/sra with constant shift amount. 9025 if (isSplatVector(Amt.getNode())) { 9026 SDValue SclrAmt = Amt->getOperand(0); 9027 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { 9028 uint64_t ShiftAmt = C->getZExtValue(); 9029 9030 if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL) 9031 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9032 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9033 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9034 9035 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL) 9036 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9037 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9038 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9039 9040 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL) 9041 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9042 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9043 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9044 9045 if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL) 9046 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9047 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9048 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9049 9050 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL) 9051 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9052 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 9053 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9054 9055 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL) 9056 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9057 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 9058 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9059 9060 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA) 9061 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9062 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 9063 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9064 9065 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA) 9066 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9067 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 9068 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9069 } 9070 } 9071 9072 // Lower SHL with variable shift amount. 9073 // Cannot lower SHL without SSE2 or later. 
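  // For the v4i32 case below, (Amt << 23) + 0x3f800000 builds, per lane, the
  // IEEE-754 bit pattern of the float 2^Amt (0x3f800000 is 1.0f, exponent
  // bias 127); converting that back to integer yields 1 << Amt, turning the
  // variable shift into a vector multiply.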
  if (!Subtarget->hasSSE2()) return SDValue();

  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
    Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
                     Op.getOperand(1), DAG.getConstant(23, MVT::i32));

    ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U));

    std::vector<Constant*> CV(4, CI);
    Constant *C = ConstantVector::get(CV);
    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
    SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                                 MachinePointerInfo::getConstantPool(),
                                 false, false, 16);

    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
  }
  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
    // a = a << 5;
    Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
                     Op.getOperand(1), DAG.getConstant(5, MVT::i32));

    ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15));
    ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63));

    std::vector<Constant*> CVM1(16, CM1);
    std::vector<Constant*> CVM2(16, CM2);
    Constant *C = ConstantVector::get(CVM1);
    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
    SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                            MachinePointerInfo::getConstantPool(),
                            false, false, 16);

    // r = pblendv(r, psllw(r & (char16)15, 4), a);
    M = DAG.getNode(ISD::AND, dl, VT, R, M);
    M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                    DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
                    DAG.getConstant(4, MVT::i32));
    R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op);
    // a += a
    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);

    C = ConstantVector::get(CVM2);
    CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
    M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                    MachinePointerInfo::getConstantPool(),
                    false, false, 16);

    // r = pblendv(r, psllw(r & (char16)63, 2), a);
    M = DAG.getNode(ISD::AND, dl, VT, R, M);
    M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                    DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
                    DAG.getConstant(2, MVT::i32));
    R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op);
    // a += a
    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);

    // return pblendv(r, r+r, a);
    R = DAG.getNode(X86ISD::PBLENDVB, dl, VT,
                    R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op);
    return R;
  }
  return SDValue();
}

SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
  // Lower the "add/sub/mul with overflow" instruction into a regular
  // instruction plus a "setcc" instruction that checks the overflow flag.
  // The "brcond" lowering looks for this combo and may remove the "setcc"
  // instruction if the "setcc" has only one use.
  SDNode *N = Op.getNode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  unsigned BaseOp = 0;
  unsigned Cond = 0;
  DebugLoc DL = Op.getDebugLoc();
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // An add of one will be selected as an INC.
SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
  // looks for this combo and may remove the "setcc" instruction if the "setcc"
  // has only one use.
  SDNode *N = Op.getNode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  unsigned BaseOp = 0;
  unsigned Cond = 0;
  DebugLoc DL = Op.getDebugLoc();
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // An add of one will be selected as an INC. Note that INC doesn't
    // set CF, so we can't do this for UADDO.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
      if (C->isOne()) {
        BaseOp = X86ISD::INC;
        Cond = X86::COND_O;
        break;
      }
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
    // A subtract of one will be selected as a DEC. Note that DEC doesn't
    // set CF, so we can't do this for USUBO.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
      if (C->isOne()) {
        BaseOp = X86ISD::DEC;
        Cond = X86::COND_O;
        break;
      }
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
    BaseOp = X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
                                 MVT::i32);
    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);

    SDValue SetCC =
      DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                  DAG.getConstant(X86::COND_O, MVT::i32),
                  SDValue(Sum.getNode(), 2));

    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
  }
  }

  // Also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);

  SDValue SetCC =
    DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
                DAG.getConstant(Cond, MVT::i32),
                SDValue(Sum.getNode(), 1));

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}

SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                                  SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();
  SDNode *Node = Op.getNode();
  EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
  EVT VT = Node->getValueType(0);

  if (Subtarget->hasSSE2() && VT.isVector()) {
    unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
                        ExtraVT.getScalarType().getSizeInBits();
    SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);

    unsigned SHLIntrinsicsID = 0;
    unsigned SRAIntrinsicsID = 0;
    switch (VT.getSimpleVT().SimpleTy) {
    default:
      return SDValue();
    case MVT::v2i64: {
      SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_q;
      SRAIntrinsicsID = 0;
      break;
    }
    case MVT::v4i32: {
      SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d;
      SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d;
      break;
    }
    case MVT::v8i16: {
      SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_w;
      SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w;
      break;
    }
    }

    SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                               DAG.getConstant(SHLIntrinsicsID, MVT::i32),
                               Node->getOperand(0), ShAmt);

    // In case of 1-bit sext, no need to shr.
    if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1;

    if (SRAIntrinsicsID) {
      Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                         DAG.getConstant(SRAIntrinsicsID, MVT::i32),
                         Tmp1, ShAmt);
    }
    return Tmp1;
  }

  return SDValue();
}
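// Editor's example (not from the original file): for
//   sign_extend_inreg <4 x i32> %x from i8,
// BitsDiff is 32 - 8 = 24, so the lowering above emits
//   t = pslld $24, %x ; r = psrad $24, %t
// which replicates bit 7 of each lane across bits 8..31.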
SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op,
                                           SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();

  // Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
  // There isn't any reason to disable it if the target processor supports it.
  if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) {
    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, MVT::i32);
    SDValue Ops[] = {
      DAG.getRegister(X86::ESP, MVT::i32), // Base
      DAG.getTargetConstant(1, MVT::i8),   // Scale
      DAG.getRegister(0, MVT::i32),        // Index
      DAG.getTargetConstant(0, MVT::i32),  // Disp
      DAG.getRegister(0, MVT::i32),        // Segment.
      Zero,
      Chain
    };
    SDNode *Res =
      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
                         array_lengthof(Ops));
    return SDValue(Res, 0);
  }

  unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
  if (!isDev)
    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));

  unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
  unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();

  // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
  if (!Op1 && !Op2 && !Op3 && Op4)
    return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0));

  // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
  if (Op1 && !Op2 && !Op3 && !Op4)
    return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0));

  // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)),
  //           (MFENCE)>;
  return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
}

SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
  EVT T = Op.getValueType();
  DebugLoc DL = Op.getDebugLoc();
  unsigned Reg = 0;
  unsigned size = 0;
  switch(T.getSimpleVT().SimpleTy) {
  default:
    assert(false && "Invalid value type!");
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
  case MVT::i16: Reg = X86::AX;  size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget->is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
                                  Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, MVT::i8),
                    cpIn.getValue(1) };
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                           Ops, 5, T, MMO);
  SDValue cpOut =
    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
  return cpOut;
}
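// Editorial note on the cmpxchg lowering above (illustrative pseudo-asm for
// the i32 case, not from the original source): the expected value is pinned
// in EAX, so the node ultimately becomes
//   mov eax, <expected> ; lock cmpxchg dword ptr [ptr], <new>
// and the old memory value is read back out of EAX afterwards; ZF reports
// whether the swap actually happened.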
SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
                                                 SelectionDAG &DAG) const {
  assert(Subtarget->is64Bit() && "Result not type legalized?");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue TheChain = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();
  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
                                   rax.getValue(2));
  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
                            DAG.getConstant(32, MVT::i8));
  SDValue Ops[] = {
    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
    rdx.getValue(1)
  };
  return DAG.getMergeValues(Ops, 2, dl);
}

SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
                                        SelectionDAG &DAG) const {
  EVT SrcVT = Op.getOperand(0).getValueType();
  EVT DstVT = Op.getValueType();
  assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
         Subtarget->hasMMX() && "Unexpected custom BITCAST");
  assert((DstVT == MVT::i64 ||
          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
         "Unexpected custom BITCAST");
  // i64 <=> MMX conversions are Legal.
  if (SrcVT==MVT::i64 && DstVT.isVector())
    return Op;
  if (DstVT==MVT::i64 && SrcVT.isVector())
    return Op;
  // MMX <=> MMX conversions are Legal.
  if (SrcVT.isVector() && DstVT.isVector())
    return Op;
  // All other conversions need to be expanded.
  return SDValue();
}

SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  DebugLoc dl = Node->getDebugLoc();
  EVT T = Node->getValueType(0);
  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
                              DAG.getConstant(0, T), Node->getOperand(2));
  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
                       cast<AtomicSDNode>(Node)->getMemoryVT(),
                       Node->getOperand(0),
                       Node->getOperand(1), negOp,
                       cast<AtomicSDNode>(Node)->getSrcValue(),
                       cast<AtomicSDNode>(Node)->getAlignment());
}

static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getNode()->getValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  unsigned Opc;
  bool ExtraOp = false;
  switch (Op.getOpcode()) {
  default: assert(0 && "Invalid code");
  case ISD::ADDC: Opc = X86ISD::ADD; break;
  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
  case ISD::SUBC: Opc = X86ISD::SUB; break;
  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
  }

  if (!ExtraOp)
    return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
                       Op.getOperand(1));
  return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
                     Op.getOperand(1), Op.getOperand(2));
}
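// Editor's example (illustration only): these carry nodes are how multi-word
// arithmetic is glued together. A hypothetical i64 add on a 32-bit target
// splits into
//   lo = ADDC a.lo, b.lo   --> ADD  (sets CF)
//   hi = ADDE a.hi, b.hi   --> ADC  (consumes CF via the extra operand)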
9429/// 9430SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 9431 switch (Op.getOpcode()) { 9432 default: llvm_unreachable("Should not custom lower this!"); 9433 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); 9434 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 9435 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 9436 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 9437 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 9438 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 9439 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 9440 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 9441 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 9442 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); 9443 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); 9444 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 9445 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 9446 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 9447 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 9448 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 9449 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 9450 case ISD::SHL_PARTS: 9451 case ISD::SRA_PARTS: 9452 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); 9453 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 9454 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 9455 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 9456 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 9457 case ISD::FABS: return LowerFABS(Op, DAG); 9458 case ISD::FNEG: return LowerFNEG(Op, DAG); 9459 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 9460 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); 9461 case ISD::SETCC: return LowerSETCC(Op, DAG); 9462 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 9463 case ISD::SELECT: return LowerSELECT(Op, DAG); 9464 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 9465 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 9466 case ISD::VASTART: return LowerVASTART(Op, DAG); 9467 case ISD::VAARG: return LowerVAARG(Op, DAG); 9468 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 9469 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 9470 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 9471 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 9472 case ISD::FRAME_TO_ARGS_OFFSET: 9473 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 9474 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 9475 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 9476 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 9477 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 9478 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 9479 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 9480 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 9481 case ISD::SRA: 9482 case ISD::SRL: 9483 case ISD::SHL: return LowerShift(Op, DAG); 9484 case ISD::SADDO: 9485 case ISD::UADDO: 9486 case ISD::SSUBO: 9487 case ISD::USUBO: 9488 case ISD::SMULO: 9489 case ISD::UMULO: return LowerXALUO(Op, DAG); 9490 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 9491 case ISD::BITCAST: return LowerBITCAST(Op, DAG); 9492 case ISD::ADDC: 9493 case ISD::ADDE: 9494 case ISD::SUBC: 9495 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 9496 } 9497} 9498 9499void 
void X86TargetLowering::
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue> &Results,
                        SelectionDAG &DAG, unsigned NewOp) const {
  EVT T = Node->getValueType(0);
  DebugLoc dl = Node->getDebugLoc();
  assert(T == MVT::i64 && "Only know how to expand i64 atomics");

  SDValue Chain = Node->getOperand(0);
  SDValue In1 = Node->getOperand(1);
  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(0));
  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(1));
  SDValue Ops[] = { Chain, In1, In2L, In2H };
  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
  SDValue Result =
    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
                            cast<MemSDNode>(Node)->getMemOperand());
  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1) };
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
  Results.push_back(Result.getValue(2));
}
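// Editorial illustration: an i64 "atomicrmw add" on a 32-bit target lands
// here (via the ATOMIC_LOAD_ADD case below) and becomes, schematically,
//   lo/hi = EXTRACT_ELEMENT(val, 0/1)
//   {lo', hi', chain} = ATOMADD64_DAG(chain, ptr, lo, hi)
//   result = BUILD_PAIR(lo', hi')
// with the actual cmpxchg8b loop emitted later by the custom inserter.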
/// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  DebugLoc dl = N->getDebugLoc();
  switch (N->getOpcode()) {
  default:
    assert(false && "Do not know how to custom type legalize this operation!");
    return;
  case ISD::SIGN_EXTEND_INREG:
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:
    // We don't want to expand or promote these.
    return;
  case ISD::FP_TO_SINT: {
    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode() != 0) {
      EVT VT = N->getValueType(0);
      // Return a load from the stack slot.
      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
                                    MachinePointerInfo(), false, false, 0));
    }
    return;
  }
  case ISD::READCYCLECOUNTER: {
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue TheChain = N->getOperand(0);
    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
                                     rd.getValue(1));
    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
                                     eax.getValue(2));
    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
    SDValue Ops[] = { eax, edx };
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
    Results.push_back(edx.getValue(1));
    return;
  }
  case ISD::ATOMIC_CMP_SWAP: {
    EVT T = N->getValueType(0);
    assert(T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(0, MVT::i32));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(1, MVT::i32));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
                             cpInL.getValue(1));
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(0, MVT::i32));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(1, MVT::i32));
    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
                               cpInH.getValue(1));
    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
                               swapInL.getValue(1));
    SDValue Ops[] = { swapInH.getValue(0),
                      N->getOperand(1),
                      swapInH.getValue(1) };
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
    SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys,
                                             Ops, 3, T, MMO);
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
                                        MVT::i32, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
                                        MVT::i32, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0) };
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
    Results.push_back(cpOutH.getValue(1));
    return;
  }
  case ISD::ATOMIC_LOAD_ADD:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
    return;
  case ISD::ATOMIC_LOAD_AND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_NAND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_OR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
    return;
  case ISD::ATOMIC_LOAD_SUB:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
    return;
  case ISD::ATOMIC_LOAD_XOR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
    return;
  case ISD::ATOMIC_SWAP:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
    return;
  }
}
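// Editor's note on the ATOMIC_CMP_SWAP expansion above: LCMPXCHG8_DAG follows
// the hardware convention for cmpxchg8b --
//   expected value in EDX:EAX, replacement in ECX:EBX,
//   old memory value returned in EDX:EAX --
// which is why the code threads the four CopyToReg/CopyFromReg pairs through
// exactly those registers.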
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return NULL;
  case X86ISD::BSF: return "X86ISD::BSF";
  case X86ISD::BSR: return "X86ISD::BSR";
  case X86ISD::SHLD: return "X86ISD::SHLD";
  case X86ISD::SHRD: return "X86ISD::SHRD";
  case X86ISD::FAND: return "X86ISD::FAND";
  case X86ISD::FOR: return "X86ISD::FOR";
  case X86ISD::FXOR: return "X86ISD::FXOR";
  case X86ISD::FSRL: return "X86ISD::FSRL";
  case X86ISD::FILD: return "X86ISD::FILD";
  case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
  case X86ISD::FLD: return "X86ISD::FLD";
  case X86ISD::FST: return "X86ISD::FST";
  case X86ISD::CALL: return "X86ISD::CALL";
  case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
  case X86ISD::BT: return "X86ISD::BT";
  case X86ISD::CMP: return "X86ISD::CMP";
  case X86ISD::COMI: return "X86ISD::COMI";
  case X86ISD::UCOMI: return "X86ISD::UCOMI";
  case X86ISD::SETCC: return "X86ISD::SETCC";
  case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
  case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd";
  case X86ISD::FSETCCss: return "X86ISD::FSETCCss";
  case X86ISD::CMOV: return "X86ISD::CMOV";
  case X86ISD::BRCOND: return "X86ISD::BRCOND";
  case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
  case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
  case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
  case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
  case X86ISD::Wrapper: return "X86ISD::Wrapper";
  case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
  case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
  case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
  case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
  case X86ISD::PINSRB: return "X86ISD::PINSRB";
  case X86ISD::PINSRW: return "X86ISD::PINSRW";
  case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
  case X86ISD::ANDNP: return "X86ISD::ANDNP";
  case X86ISD::PSIGNB: return "X86ISD::PSIGNB";
  case X86ISD::PSIGNW: return "X86ISD::PSIGNW";
  case X86ISD::PSIGND: return "X86ISD::PSIGND";
  case X86ISD::PBLENDVB: return "X86ISD::PBLENDVB";
  case X86ISD::FMAX: return "X86ISD::FMAX";
  case X86ISD::FMIN: return "X86ISD::FMIN";
  case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
  case X86ISD::FRCP: return "X86ISD::FRCP";
  case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
  case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
  case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
  case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
  case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
  case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
  case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
  case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG";
  case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG";
  case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG";
  case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG";
  case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG";
  case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
  case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
  case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
  case X86ISD::VSHL: return "X86ISD::VSHL";
  case X86ISD::VSRL: return "X86ISD::VSRL";
  case X86ISD::CMPPD: return "X86ISD::CMPPD";
  case X86ISD::CMPPS: return "X86ISD::CMPPS";
  case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB";
  case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW";
  case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD";
  case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ";
  case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB";
  case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW";
  case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD";
  case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ";
  case X86ISD::ADD: return "X86ISD::ADD";
  case X86ISD::SUB: return "X86ISD::SUB";
  case X86ISD::ADC: return "X86ISD::ADC";
return "X86ISD::SBB"; 9707 case X86ISD::SMUL: return "X86ISD::SMUL"; 9708 case X86ISD::UMUL: return "X86ISD::UMUL"; 9709 case X86ISD::INC: return "X86ISD::INC"; 9710 case X86ISD::DEC: return "X86ISD::DEC"; 9711 case X86ISD::OR: return "X86ISD::OR"; 9712 case X86ISD::XOR: return "X86ISD::XOR"; 9713 case X86ISD::AND: return "X86ISD::AND"; 9714 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 9715 case X86ISD::PTEST: return "X86ISD::PTEST"; 9716 case X86ISD::TESTP: return "X86ISD::TESTP"; 9717 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 9718 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 9719 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 9720 case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; 9721 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 9722 case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; 9723 case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; 9724 case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; 9725 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 9726 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 9727 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 9728 case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; 9729 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 9730 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 9731 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 9732 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 9733 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 9734 case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; 9735 case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; 9736 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 9737 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 9738 case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; 9739 case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; 9740 case X86ISD::VUNPCKLPDY: return "X86ISD::VUNPCKLPDY"; 9741 case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; 9742 case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; 9743 case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; 9744 case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; 9745 case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; 9746 case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; 9747 case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; 9748 case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; 9749 case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; 9750 case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; 9751 case X86ISD::VPERMIL: return "X86ISD::VPERMIL"; 9752 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 9753 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 9754 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 9755 } 9756} 9757 9758// isLegalAddressingMode - Return true if the addressing mode represented 9759// by AM is legal for this target, for a load/store of the specified type. 9760bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 9761 Type *Ty) const { 9762 // X86 supports extremely general addressing modes. 9763 CodeModel::Model M = getTargetMachine().getCodeModel(); 9764 Reloc::Model R = getTargetMachine().getRelocationModel(); 9765 9766 // X86 allows a sign-extended 32-bit immediate field as a displacement. 9767 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 9768 return false; 9769 9770 if (AM.BaseGV) { 9771 unsigned GVFlags = 9772 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 9773 9774 // If a reference to this global requires an extra load, we can't fold it. 
// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                              Type *Ty) const {
  // X86 supports extremely general addressing modes.
  CodeModel::Model M = getTargetMachine().getCodeModel();
  Reloc::Model R = getTargetMachine().getRelocationModel();

  // X86 allows a sign-extended 32-bit immediate field as a displacement.
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
    return false;

  if (AM.BaseGV) {
    unsigned GVFlags =
      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());

    // If a reference to this global requires an extra load, we can't fold it.
    if (isGlobalStubReference(GVFlags))
      return false;

    // If BaseGV requires a register for the PIC base, we cannot also have a
    // BaseReg specified.
    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
      return false;

    // If lower 4G is not available, then we must use rip-relative addressing.
    if ((M != CodeModel::Small || R != Reloc::Static) &&
        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
      return false;
  }

  switch (AM.Scale) {
  case 0:
  case 1:
  case 2:
  case 4:
  case 8:
    // These scales always work.
    break;
  case 3:
  case 5:
  case 9:
    // These scales are formed with basereg+scalereg. Only accept if there is
    // no basereg yet.
    if (AM.HasBaseReg)
      return false;
    break;
  default: // Other stuff never works.
    return false;
  }

  return true;
}


bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  if (NumBits1 <= NumBits2)
    return false;
  return true;
}

bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  if (NumBits1 <= NumBits2)
    return false;
  return true;
}

bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
}

bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
}

bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  // i16 instructions are longer (0x66 prefix) and potentially slower.
  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                      EVT VT) const {
  // Very little shuffling can be done for 64-bit vectors right now.
  if (VT.getSizeInBits() == 64)
    return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());

  // FIXME: pshufb, blends, shifts.
  return (VT.getVectorNumElements() == 2 ||
          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
          isMOVLMask(M, VT) ||
          isSHUFPMask(M, VT) ||
          isPSHUFDMask(M, VT) ||
          isPSHUFHWMask(M, VT) ||
          isPSHUFLWMask(M, VT) ||
          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
          isUNPCKLMask(M, VT) ||
          isUNPCKHMask(M, VT) ||
          isUNPCKL_v_undef_Mask(M, VT) ||
          isUNPCKH_v_undef_Mask(M, VT));
}
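// Editor's example (illustration only): a v4i32 mask of <0,0,0,0> is a splat
// and is accepted above via isSplatMask (it maps to "pshufd $0"); a mask
// like <0,5,2,7> matches none of the listed predicates, so it would be left
// to the generic shuffle expansion rather than formed by DAG combines.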
bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                          EVT VT) const {
  unsigned NumElts = VT.getVectorNumElements();
  // FIXME: This collection of masks seems suspect.
  if (NumElts == 2)
    return true;
  if (NumElts == 4 && VT.getSizeInBits() == 128) {
    return (isMOVLMask(Mask, VT)  ||
            isCommutedMOVLMask(Mask, VT, true) ||
            isSHUFPMask(Mask, VT) ||
            isCommutedSHUFPMask(Mask, VT));
  }
  return false;
}

//===----------------------------------------------------------------------===//
//                           X86 Scheduler Hooks
//===----------------------------------------------------------------------===//

// private utility function
MachineBasicBlock *
X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
                                                       MachineBasicBlock *MBB,
                                                       unsigned regOpc,
                                                       unsigned immOpc,
                                                       unsigned LoadOpc,
                                                       unsigned CXchgOpc,
                                                       unsigned notOpc,
                                                       unsigned EAXreg,
                                                       TargetRegisterClass *RC,
                                                       bool invSrc) const {
  // For the atomic bitwise operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld  t1 = [bitinstr.addr]
  //     op  t2 = t1, [bitinstr.val]
  //     mov EAX = t1
  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
  //     bz  newMBB
  //     fallthrough -->nextMBB
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(bInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  // Insert instructions into newMBB based on incoming instruction
  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
         "unexpected number of operands");
  DebugLoc dl = bInstr->getDebugLoc();
  MachineOperand& destOper = bInstr->getOperand(0);
  MachineOperand* argOpers[2 + X86::AddrNumOperands];
  int numArgs = bInstr->getNumOperands() - 1;
  for (int i=0; i < numArgs; ++i)
    argOpers[i] = &bInstr->getOperand(i+1);

  // x86 address has 5 operands: base, scale, index, displacement, and segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
  int valArgIndx = lastAddrIndx + 1;

  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  unsigned tt = F->getRegInfo().createVirtualRegister(RC);
  if (invSrc) {
    MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
  }
  else
    tt = t1;

  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
  MIB.addReg(tt);
  (*MIB).addOperand(*argOpers[valArgIndx]);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
  MIB.addReg(t1);

  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MIB.addReg(t2);
  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
                    bInstr->memoperands_end());

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
  MIB.addReg(EAXreg);

  // insert branch
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);

  bInstr->eraseFromParent(); // The pseudo instruction is gone now.
  return nextMBB;
}
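// Editorial pseudo-asm of the loop built above, for an i32 "atomicrmw and"
// (illustration only, register names schematic):
//   newMBB:
//     mov  t1, [addr]
//     mov  t2, t1 ; and t2, val
//     mov  eax, t1
//     lock cmpxchg [addr], t2   ; writes t2 only if [addr] still equals eax
//     jne  newMBB               ; another thread raced us: retry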
// private utility function: 64 bit atomics on 32 bit host.
MachineBasicBlock *
X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
                                                       MachineBasicBlock *MBB,
                                                       unsigned regOpcL,
                                                       unsigned regOpcH,
                                                       unsigned immOpcL,
                                                       unsigned immOpcH,
                                                       bool invSrc) const {
  // For the atomic bitwise operator, we generate
  //   thisMBB (instructions are in pairs, except cmpxchg8b)
  //     ld t1,t2 = [bitinstr.addr]
  //   newMBB:
  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
  //     op  t5, t6 <- out1, out2, [bitinstr.val]
  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
  //     mov ECX, EBX <- t5, t6
  //     mov EAX, EDX <- t1, t2
  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
  //     mov t3, t4 <- EAX, EDX
  //     bz  newMBB
  //     result in out1, out2
  //     fallthrough -->nextMBB

  const TargetRegisterClass *RC = X86::GR32RegisterClass;
  const unsigned LoadOpc = X86::MOV32rm;
  const unsigned NotOpc = X86::NOT32r;
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(bInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  DebugLoc dl = bInstr->getDebugLoc();
  // Insert instructions into newMBB based on incoming instruction.
  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
         "unexpected number of operands");
  MachineOperand& dest1Oper = bInstr->getOperand(0);
  MachineOperand& dest2Oper = bInstr->getOperand(1);
  MachineOperand* argOpers[2 + X86::AddrNumOperands];
  for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
    argOpers[i] = &bInstr->getOperand(i+2);

    // We use some of the operands multiple times, so conservatively just
    // clear any kill flags that might be present.
    if (argOpers[i]->isReg() && argOpers[i]->isUse())
      argOpers[i]->setIsKill(false);
  }

  // x86 address has 5 operands: base, scale, index, displacement, and segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]

  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
  // add 4 to displacement.
  for (int i=0; i <= lastAddrIndx-2; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MachineOperand newOp3 = *(argOpers[3]);
  if (newOp3.isImm())
    newOp3.setImm(newOp3.getImm()+4);
  else
    newOp3.setOffset(newOp3.getOffset()+4);
  (*MIB).addOperand(newOp3);
  (*MIB).addOperand(*argOpers[lastAddrIndx]);

  // t3/4 are defined later, at the bottom of the loop
  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);

  // The subsequent operations should be using the destination registers of
  // the PHI instructions.
  if (invSrc) {
    t1 = F->getRegInfo().createVirtualRegister(RC);
    t2 = F->getRegInfo().createVirtualRegister(RC);
    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
  } else {
    t1 = dest1Oper.getReg();
    t2 = dest2Oper.getReg();
  }

  int valArgIndx = lastAddrIndx + 1;
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");
  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
  if (regOpcL != X86::MOV32rr)
    MIB.addReg(t1);
  (*MIB).addOperand(*argOpers[valArgIndx]);
  assert(argOpers[valArgIndx + 1]->isReg() ==
         argOpers[valArgIndx]->isReg());
  assert(argOpers[valArgIndx + 1]->isImm() ==
         argOpers[valArgIndx]->isImm());
  if (argOpers[valArgIndx + 1]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
  if (regOpcH != X86::MOV32rr)
    MIB.addReg(t2);
  (*MIB).addOperand(*argOpers[valArgIndx + 1]);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
  MIB.addReg(t1);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
  MIB.addReg(t2);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
  MIB.addReg(t5);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
  MIB.addReg(t6);

  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
                    bInstr->memoperands_end());

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
  MIB.addReg(X86::EAX);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
  MIB.addReg(X86::EDX);

  // insert branch
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);

  bInstr->eraseFromParent(); // The pseudo instruction is gone now.
  return nextMBB;
}
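// Editor's note: the paired loads above rely on x86 being little-endian;
// reading [addr] and [addr+4] yields the low and high halves of the i64.
// For example, with the value 0x1122334455667788 stored at addr,
//   [addr]   = 0x55667788 (lo, t1)
//   [addr+4] = 0x11223344 (hi, t2).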
// private utility function
MachineBasicBlock *
X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
                                                      MachineBasicBlock *MBB,
                                                      unsigned cmovOpc) const {
  // For the atomic min/max operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld t1 = [min/max.addr]
  //     mov t2 = [min/max.val]
  //     cmp  t1, t2
  //     cmov[cond] t2 = t1
  //     mov EAX = t1
  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
  //     bz   newMBB
  //     fallthrough -->nextMBB
  //
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(mInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  DebugLoc dl = mInstr->getDebugLoc();
  // Insert instructions into newMBB based on incoming instruction
  assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
         "unexpected number of operands");
  MachineOperand& destOper = mInstr->getOperand(0);
  MachineOperand* argOpers[2 + X86::AddrNumOperands];
  int numArgs = mInstr->getNumOperands() - 1;
  for (int i=0; i < numArgs; ++i)
    argOpers[i] = &mInstr->getOperand(i+1);

  // x86 address has 5 operands: base, scale, index, displacement, and segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
  int valArgIndx = lastAddrIndx + 1;

  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  // We only support register and immediate values
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");

  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
  else
    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
  (*MIB).addOperand(*argOpers[valArgIndx]);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
  MIB.addReg(t1);

  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
  MIB.addReg(t1);
  MIB.addReg(t2);

  // Generate the conditional move
  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc), t3);
  MIB.addReg(t2);
  MIB.addReg(t1);

  // Compare and exchange if no one has modified the memory location
  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MIB.addReg(t3);
  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(mInstr->memoperands_begin(),
                    mInstr->memoperands_end());

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
  MIB.addReg(X86::EAX);

  // insert branch
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);

  mInstr->eraseFromParent(); // The pseudo instruction is gone now.
  return nextMBB;
}
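// Editorial pseudo-asm for the min/max loop above (illustration only,
// signed i32; cmovCC depends on which min/max flavor is requested):
//   newMBB:
//     mov    t1, [addr]
//     mov    t2, val
//     cmp    t1, t2
//     cmovCC t3, ...            ; t3 = winner of (t1, t2)
//     mov    eax, t1
//     lock   cmpxchg [addr], t3
//     jne    newMBB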
// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX all of this code can be replaced with that
// in the .td file.
MachineBasicBlock *
X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
                            unsigned numArgs, bool memArg) const {
  assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) &&
         "Target must have SSE4.2 or AVX features enabled");

  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  unsigned Opc;
  if (!Subtarget->hasAVX()) {
    if (memArg)
      Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
    else
      Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
  } else {
    if (memArg)
      Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm;
    else
      Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
  }

  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
  for (unsigned i = 0; i < numArgs; ++i) {
    MachineOperand &Op = MI->getOperand(i+1);
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.addOperand(Op);
  }
  BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
    .addReg(X86::XMM0);

  MI->eraseFromParent();
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  // Address into RAX/EAX, other two args into ECX, EDX.
  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.addOperand(MI->getOperand(i));

  unsigned ValOps = X86::AddrNumOperands;
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
    .addReg(MI->getOperand(ValOps).getReg());
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
    .addReg(MI->getOperand(ValOps+1).getReg());

  // The instruction doesn't actually take any explicit operands.
  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));

  MI->eraseFromParent(); // The pseudo is gone now.
  return BB;
}
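// Editor's note: the register shuffling above and below mirrors the ISA
// contract -- MONITOR takes the linear address in RAX/EAX with extensions in
// ECX and hints in EDX, while MWAIT takes extensions in ECX and hints in
// EAX; neither instruction encodes explicit operands.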
MachineBasicBlock *
X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const {
  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  // First arg in ECX, the second in EAX.
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
    .addReg(MI->getOperand(0).getReg());
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
    .addReg(MI->getOperand(1).getReg());

  // The instruction doesn't actually take any explicit operands.
  BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr));

  MI->eraseFromParent(); // The pseudo is gone now.
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(
                   MachineInstr *MI,
                   MachineBasicBlock *MBB) const {
  // Emit va_arg instruction on X86-64.

  // Operands to this pseudo-instruction:
  // 0  ) Output  : destination address (reg)
  // 1-5) Input   : va_list address (addr, i64mem)
  // 6  ) ArgSize : Size (in bytes) of vararg type
  // 7  ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
  // 8  ) Align   : Alignment of type
  // 9  ) EFLAGS (implicit-def)

  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");

  unsigned DestReg = MI->getOperand(0).getReg();
  MachineOperand &Base = MI->getOperand(1);
  MachineOperand &Scale = MI->getOperand(2);
  MachineOperand &Index = MI->getOperand(3);
  MachineOperand &Disp = MI->getOperand(4);
  MachineOperand &Segment = MI->getOperand(5);
  unsigned ArgSize = MI->getOperand(6).getImm();
  unsigned ArgMode = MI->getOperand(7).getImm();
  unsigned Align = MI->getOperand(8).getImm();

  // Memory Reference
  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();

  // Machine Information
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
  DebugLoc DL = MI->getDebugLoc();

  // struct va_list {
  //   i32   gp_offset
  //   i32   fp_offset
  //   i64   overflow_area (address)
  //   i64   reg_save_area (address)
  // }
  // sizeof(va_list) = 24
  // alignment(va_list) = 8

  unsigned TotalNumIntRegs = 6;
  unsigned TotalNumXMMRegs = 8;
  bool UseGPOffset = (ArgMode == 1);
  bool UseFPOffset = (ArgMode == 2);
  unsigned MaxOffset = TotalNumIntRegs * 8 +
                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);

  // Align ArgSize to a multiple of 8.
  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
  bool NeedsAlign = (Align > 8);

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *overflowMBB;
  MachineBasicBlock *offsetMBB;
  MachineBasicBlock *endMBB;

  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
  unsigned OffsetReg = 0;
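  // Editorial worked example for the rounding above: ArgSize = 12 gives
  // ArgSizeA8 = (12 + 7) & ~7 = 16, so consecutive overflow-area slots stay
  // 8-byte aligned regardless of the argument's own size.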
  if (!UseGPOffset && !UseFPOffset) {
    // If we only pull from the overflow region, we don't create a branch.
    // We don't need to alter control flow.
    OffsetDestReg = 0; // unused
    OverflowDestReg = DestReg;

    offsetMBB = NULL;
    overflowMBB = thisMBB;
    endMBB = thisMBB;
  } else {
    // First emit code to check if gp_offset (or fp_offset) is below the bound.
    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
    // If not, pull from overflow_area. (branch to overflowMBB)
    //
    //   thisMBB
    //    |     .
    //    |        .
    //   offsetMBB   overflowMBB
    //    |        .
    //    |     .
    //   endMBB

    // Registers for the PHI in endMBB
    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    MachineFunction *MF = MBB->getParent();
    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

    MachineFunction::iterator MBBIter = MBB;
    ++MBBIter;

    // Insert the new basic blocks
    MF->insert(MBBIter, offsetMBB);
    MF->insert(MBBIter, overflowMBB);
    MF->insert(MBBIter, endMBB);

    // Transfer the remainder of MBB and its successor edges to endMBB.
    endMBB->splice(endMBB->begin(), thisMBB,
                   llvm::next(MachineBasicBlock::iterator(MI)),
                   thisMBB->end());
    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

    // Make offsetMBB and overflowMBB successors of thisMBB
    thisMBB->addSuccessor(offsetMBB);
    thisMBB->addSuccessor(overflowMBB);

    // endMBB is a successor of both offsetMBB and overflowMBB
    offsetMBB->addSuccessor(endMBB);
    overflowMBB->addSuccessor(endMBB);

    // Load the offset value into a register
    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, UseFPOffset ? 4 : 0)
      .addOperand(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

    // Check if there is enough room left to pull this argument.
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
      .addReg(OffsetReg)
      .addImm(MaxOffset + 8 - ArgSizeA8);

    // Branch to "overflowMBB" if offset >= max
    // Fall through to "offsetMBB" otherwise
    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
      .addMBB(overflowMBB);
  }
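  // Editor's note (illustration): MaxOffset is 48 for the gp_offset case
  // (6 int regs * 8 bytes) and 48 + 128 = 176 when fp_offset is used
  // (8 XMM regs * 16 bytes on top), so for a typical 8-byte GP argument the
  // compare above is effectively "offset >= 48", i.e. all six registers
  // already consumed.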
  // In offsetMBB, emit code to use the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg != 0);

    // Read the reg_save_area address.
    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, 16)
      .addOperand(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

    // Zero-extend the offset
    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
      .addImm(0)
      .addReg(OffsetReg)
      .addImm(X86::sub_32bit);

    // Add the offset to the reg_save_area to get the final address.
    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
      .addReg(OffsetReg64)
      .addReg(RegSaveReg);

    // Compute the offset for the next argument
    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
      .addReg(OffsetReg)
      .addImm(UseFPOffset ? 16 : 8);

    // Store it back into the va_list.
    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, UseFPOffset ? 4 : 0)
      .addOperand(Segment)
      .addReg(NextOffsetReg)
      .setMemRefs(MMOBegin, MMOEnd);

    // Jump to endMBB
    BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
      .addMBB(endMBB);
  }

  //
  // Emit code to use overflow area
  //

  // Load the overflow_area address into a register.
  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
    .addOperand(Base)
    .addOperand(Scale)
    .addOperand(Index)
    .addDisp(Disp, 8)
    .addOperand(Segment)
    .setMemRefs(MMOBegin, MMOEnd);

  // If we need to align it, do so. Otherwise, just copy the address
  // to OverflowDestReg.
  if (NeedsAlign) {
    // Align the overflow address
    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);

    // aligned_addr = (addr + (align-1)) & ~(align-1)
    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
      .addReg(OverflowAddrReg)
      .addImm(Align-1);

    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
      .addReg(TmpReg)
      .addImm(~(uint64_t)(Align-1));
  } else {
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
      .addReg(OverflowAddrReg);
  }

  // Compute the next overflow address after this argument.
  // (the overflow address should be kept 8-byte aligned)
  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
    .addReg(OverflowDestReg)
    .addImm(ArgSizeA8);

  // Store the new overflow address.
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
    .addOperand(Base)
    .addOperand(Scale)
    .addOperand(Index)
    .addDisp(Disp, 8)
    .addOperand(Segment)
    .addReg(NextAddrReg)
    .setMemRefs(MMOBegin, MMOEnd);

  // If we branched, emit the PHI to the front of endMBB.
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), DL,
            TII->get(X86::PHI), DestReg)
      .addReg(OffsetDestReg).addMBB(offsetMBB)
      .addReg(OverflowDestReg).addMBB(overflowMBB);
  }

  // Erase the pseudo instruction
  MI->eraseFromParent();

  return endMBB;
}
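// Editorial worked example for the overflow-area alignment above: with
// Align = 16 and an overflow address ending in 0x28,
//   (0x28 + 15) & ~15 = 0x30,
// so an over-aligned vararg is pulled from the next 16-byte boundary.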
It's less code, and it's probably 10614 // easier on the hardware branch predictor, and stores aren't all that 10615 // expensive anyway. 10616 10617 // Create the new basic blocks. One block contains all the XMM stores, 10618 // and one block is the final destination regardless of whether any 10619 // stores were performed. 10620 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 10621 MachineFunction *F = MBB->getParent(); 10622 MachineFunction::iterator MBBIter = MBB; 10623 ++MBBIter; 10624 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 10625 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 10626 F->insert(MBBIter, XMMSaveMBB); 10627 F->insert(MBBIter, EndMBB); 10628 10629 // Transfer the remainder of MBB and its successor edges to EndMBB. 10630 EndMBB->splice(EndMBB->begin(), MBB, 10631 llvm::next(MachineBasicBlock::iterator(MI)), 10632 MBB->end()); 10633 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 10634 10635 // The original block will now fall through to the XMM save block. 10636 MBB->addSuccessor(XMMSaveMBB); 10637 // The XMMSaveMBB will fall through to the end block. 10638 XMMSaveMBB->addSuccessor(EndMBB); 10639 10640 // Now add the instructions. 10641 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10642 DebugLoc DL = MI->getDebugLoc(); 10643 10644 unsigned CountReg = MI->getOperand(0).getReg(); 10645 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 10646 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 10647 10648 if (!Subtarget->isTargetWin64()) { 10649 // If %al is 0, branch around the XMM save block. 10650 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 10651 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 10652 MBB->addSuccessor(EndMBB); 10653 } 10654 10655 // In the XMM save block, save all the XMM argument registers. 10656 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 10657 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 10658 MachineMemOperand *MMO = 10659 F->getMachineMemOperand( 10660 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 10661 MachineMemOperand::MOStore, 10662 /*Size=*/16, /*Align=*/16); 10663 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 10664 .addFrameIndex(RegSaveFrameIndex) 10665 .addImm(/*Scale=*/1) 10666 .addReg(/*IndexReg=*/0) 10667 .addImm(/*Disp=*/Offset) 10668 .addReg(/*Segment=*/0) 10669 .addReg(MI->getOperand(i).getReg()) 10670 .addMemOperand(MMO); 10671 } 10672 10673 MI->eraseFromParent(); // The pseudo instruction is gone now. 10674 10675 return EndMBB; 10676} 10677 10678MachineBasicBlock * 10679X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 10680 MachineBasicBlock *BB) const { 10681 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10682 DebugLoc DL = MI->getDebugLoc(); 10683 10684 // To "insert" a SELECT_CC instruction, we actually have to insert the 10685 // diamond control-flow pattern. The incoming instruction knows the 10686 // destination vreg to set, the condition code register to branch on, the 10687 // true/false values to select between, and a branch opcode to use. 10688 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10689 MachineFunction::iterator It = BB; 10690 ++It; 10691 10692 // thisMBB: 10693 // ... 10694 // TrueVal = ... 
10695 // cmpTY ccX, r1, r2 10696 // bCC copy1MBB 10697 // fallthrough --> copy0MBB 10698 MachineBasicBlock *thisMBB = BB; 10699 MachineFunction *F = BB->getParent(); 10700 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 10701 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 10702 F->insert(It, copy0MBB); 10703 F->insert(It, sinkMBB); 10704 10705 // If the EFLAGS register isn't dead in the terminator, then claim that it's 10706 // live into the sink and copy blocks. 10707 const MachineFunction *MF = BB->getParent(); 10708 const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); 10709 BitVector ReservedRegs = TRI->getReservedRegs(*MF); 10710 10711 for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { 10712 const MachineOperand &MO = MI->getOperand(I); 10713 if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; 10714 unsigned Reg = MO.getReg(); 10715 if (Reg != X86::EFLAGS) continue; 10716 copy0MBB->addLiveIn(Reg); 10717 sinkMBB->addLiveIn(Reg); 10718 } 10719 10720 // Transfer the remainder of BB and its successor edges to sinkMBB. 10721 sinkMBB->splice(sinkMBB->begin(), BB, 10722 llvm::next(MachineBasicBlock::iterator(MI)), 10723 BB->end()); 10724 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 10725 10726 // Add the true and fallthrough blocks as its successors. 10727 BB->addSuccessor(copy0MBB); 10728 BB->addSuccessor(sinkMBB); 10729 10730 // Create the conditional branch instruction. 10731 unsigned Opc = 10732 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 10733 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 10734 10735 // copy0MBB: 10736 // %FalseValue = ... 10737 // # fallthrough to sinkMBB 10738 copy0MBB->addSuccessor(sinkMBB); 10739 10740 // sinkMBB: 10741 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 10742 // ... 10743 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 10744 TII->get(X86::PHI), MI->getOperand(0).getReg()) 10745 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 10746 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 10747 10748 MI->eraseFromParent(); // The pseudo instruction is gone now. 10749 return sinkMBB; 10750} 10751 10752MachineBasicBlock * 10753X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 10754 MachineBasicBlock *BB) const { 10755 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10756 DebugLoc DL = MI->getDebugLoc(); 10757 10758 assert(!Subtarget->isTargetEnvMacho()); 10759 10760 // The lowering is pretty easy: we're just emitting the call to _alloca. The 10761 // non-trivial part is impdef of ESP. 10762 10763 if (Subtarget->isTargetWin64()) { 10764 if (Subtarget->isTargetCygMing()) { 10765 // ___chkstk(Mingw64): 10766 // Clobbers R10, R11, RAX and EFLAGS. 10767 // Updates RSP. 10768 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 10769 .addExternalSymbol("___chkstk") 10770 .addReg(X86::RAX, RegState::Implicit) 10771 .addReg(X86::RSP, RegState::Implicit) 10772 .addReg(X86::RAX, RegState::Define | RegState::Implicit) 10773 .addReg(X86::RSP, RegState::Define | RegState::Implicit) 10774 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 10775 } else { 10776 // __chkstk(MSVCRT): does not update stack pointer. 10777 // Clobbers R10, R11 and EFLAGS. 10778 // FIXME: RAX(allocated size) might be reused and not killed. 
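        // Note: MSVC's __chkstk only probes the pages being allocated; the
        // actual stack-pointer adjustment (RSP -= RAX) is emitted separately
        // below.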
10779 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
10780 .addExternalSymbol("__chkstk")
10781 .addReg(X86::RAX, RegState::Implicit)
10782 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
10783 // RAX has the offset to be subtracted from RSP.
10784 BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
10785 .addReg(X86::RSP)
10786 .addReg(X86::RAX);
10787 }
10788 } else {
10789 const char *StackProbeSymbol =
10790 Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";
10791
10792 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
10793 .addExternalSymbol(StackProbeSymbol)
10794 .addReg(X86::EAX, RegState::Implicit)
10795 .addReg(X86::ESP, RegState::Implicit)
10796 .addReg(X86::EAX, RegState::Define | RegState::Implicit)
10797 .addReg(X86::ESP, RegState::Define | RegState::Implicit)
10798 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
10799 }
10800
10801 MI->eraseFromParent(); // The pseudo instruction is gone now.
10802 return BB;
10803}
10804
10805MachineBasicBlock *
10806X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
10807 MachineBasicBlock *BB) const {
10808 // This is pretty easy. We're taking the value that we received from
10809 // our load from the relocation, sticking it in either RDI (x86-64)
10810 // or EAX and doing an indirect call. The return value will then
10811 // be in the normal return register.
10812 const X86InstrInfo *TII
10813 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
10814 DebugLoc DL = MI->getDebugLoc();
10815 MachineFunction *F = BB->getParent();
10816
10817 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
10818 assert(MI->getOperand(3).isGlobal() && "This should be a global");
10819
10820 if (Subtarget->is64Bit()) {
10821 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
10822 TII->get(X86::MOV64rm), X86::RDI)
10823 .addReg(X86::RIP)
10824 .addImm(0).addReg(0)
10825 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
10826 MI->getOperand(3).getTargetFlags())
10827 .addReg(0);
10828 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
10829 addDirectMem(MIB, X86::RDI);
10830 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
10831 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
10832 TII->get(X86::MOV32rm), X86::EAX)
10833 .addReg(0)
10834 .addImm(0).addReg(0)
10835 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
10836 MI->getOperand(3).getTargetFlags())
10837 .addReg(0);
10838 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
10839 addDirectMem(MIB, X86::EAX);
10840 } else {
10841 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
10842 TII->get(X86::MOV32rm), X86::EAX)
10843 .addReg(TII->getGlobalBaseReg(F))
10844 .addImm(0).addReg(0)
10845 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
10846 MI->getOperand(3).getTargetFlags())
10847 .addReg(0);
10848 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
10849 addDirectMem(MIB, X86::EAX);
10850 }
10851
10852 MI->eraseFromParent(); // The pseudo instruction is gone now.
10853 return BB;
10854}
10855
10856MachineBasicBlock *
10857X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
10858 MachineBasicBlock *BB) const {
10859 switch (MI->getOpcode()) {
10860 default: assert(false && "Unexpected instr type to insert");
10861 case X86::TAILJMPd64:
10862 case X86::TAILJMPr64:
10863 case X86::TAILJMPm64:
10864 assert(!"TAILJMP64 would not be touched here.");
10865 case X86::TCRETURNdi64:
10866 case X86::TCRETURNri64:
10867 case X86::TCRETURNmi64:
10868 // Defs of TCRETURNxx64 include Win64's callee-saved registers as a subset.
10869 // On AMD64, additional defs should be added before register allocation.
10870 if (!Subtarget->isTargetWin64()) {
10871 MI->addRegisterDefined(X86::RSI);
10872 MI->addRegisterDefined(X86::RDI);
10873 MI->addRegisterDefined(X86::XMM6);
10874 MI->addRegisterDefined(X86::XMM7);
10875 MI->addRegisterDefined(X86::XMM8);
10876 MI->addRegisterDefined(X86::XMM9);
10877 MI->addRegisterDefined(X86::XMM10);
10878 MI->addRegisterDefined(X86::XMM11);
10879 MI->addRegisterDefined(X86::XMM12);
10880 MI->addRegisterDefined(X86::XMM13);
10881 MI->addRegisterDefined(X86::XMM14);
10882 MI->addRegisterDefined(X86::XMM15);
10883 }
10884 return BB;
10885 case X86::WIN_ALLOCA:
10886 return EmitLoweredWinAlloca(MI, BB);
10887 case X86::TLSCall_32:
10888 case X86::TLSCall_64:
10889 return EmitLoweredTLSCall(MI, BB);
10890 case X86::CMOV_GR8:
10891 case X86::CMOV_FR32:
10892 case X86::CMOV_FR64:
10893 case X86::CMOV_V4F32:
10894 case X86::CMOV_V2F64:
10895 case X86::CMOV_V2I64:
10896 case X86::CMOV_GR16:
10897 case X86::CMOV_GR32:
10898 case X86::CMOV_RFP32:
10899 case X86::CMOV_RFP64:
10900 case X86::CMOV_RFP80:
10901 return EmitLoweredSelect(MI, BB);
10902
10903 case X86::FP32_TO_INT16_IN_MEM:
10904 case X86::FP32_TO_INT32_IN_MEM:
10905 case X86::FP32_TO_INT64_IN_MEM:
10906 case X86::FP64_TO_INT16_IN_MEM:
10907 case X86::FP64_TO_INT32_IN_MEM:
10908 case X86::FP64_TO_INT64_IN_MEM:
10909 case X86::FP80_TO_INT16_IN_MEM:
10910 case X86::FP80_TO_INT32_IN_MEM:
10911 case X86::FP80_TO_INT64_IN_MEM: {
10912 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
10913 DebugLoc DL = MI->getDebugLoc();
10914
10915 // Change the floating point control register to use "round towards zero"
10916 // mode when truncating to an integer value.
10917 MachineFunction *F = BB->getParent();
10918 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
10919 addFrameReference(BuildMI(*BB, MI, DL,
10920 TII->get(X86::FNSTCW16m)), CWFrameIdx);
10921
10922 // Load the old value of the control word...
10923 unsigned OldCW =
10924 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
10925 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
10926 CWFrameIdx);
10927
10928 // Set the control word to round towards zero...
10929 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
10930 .addImm(0xC7F);
10931
10932 // Reload the modified control word now...
10933 addFrameReference(BuildMI(*BB, MI, DL,
10934 TII->get(X86::FLDCW16m)), CWFrameIdx);
10935
10936 // Restore the memory image of the control word to its original value
10937 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
10938 .addReg(OldCW);
10939
10940 // Get the X86 opcode to use.
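    // (Opcode naming, for the mapping below: IST_FpXmY is an x87 integer
    // store of an X-bit integer taken from a Y-bit floating-point operand.)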
10941 unsigned Opc; 10942 switch (MI->getOpcode()) { 10943 default: llvm_unreachable("illegal opcode!"); 10944 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 10945 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 10946 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 10947 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 10948 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 10949 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 10950 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 10951 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 10952 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 10953 } 10954 10955 X86AddressMode AM; 10956 MachineOperand &Op = MI->getOperand(0); 10957 if (Op.isReg()) { 10958 AM.BaseType = X86AddressMode::RegBase; 10959 AM.Base.Reg = Op.getReg(); 10960 } else { 10961 AM.BaseType = X86AddressMode::FrameIndexBase; 10962 AM.Base.FrameIndex = Op.getIndex(); 10963 } 10964 Op = MI->getOperand(1); 10965 if (Op.isImm()) 10966 AM.Scale = Op.getImm(); 10967 Op = MI->getOperand(2); 10968 if (Op.isImm()) 10969 AM.IndexReg = Op.getImm(); 10970 Op = MI->getOperand(3); 10971 if (Op.isGlobal()) { 10972 AM.GV = Op.getGlobal(); 10973 } else { 10974 AM.Disp = Op.getImm(); 10975 } 10976 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 10977 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 10978 10979 // Reload the original control word now. 10980 addFrameReference(BuildMI(*BB, MI, DL, 10981 TII->get(X86::FLDCW16m)), CWFrameIdx); 10982 10983 MI->eraseFromParent(); // The pseudo instruction is gone now. 10984 return BB; 10985 } 10986 // String/text processing lowering. 10987 case X86::PCMPISTRM128REG: 10988 case X86::VPCMPISTRM128REG: 10989 return EmitPCMP(MI, BB, 3, false /* in-mem */); 10990 case X86::PCMPISTRM128MEM: 10991 case X86::VPCMPISTRM128MEM: 10992 return EmitPCMP(MI, BB, 3, true /* in-mem */); 10993 case X86::PCMPESTRM128REG: 10994 case X86::VPCMPESTRM128REG: 10995 return EmitPCMP(MI, BB, 5, false /* in mem */); 10996 case X86::PCMPESTRM128MEM: 10997 case X86::VPCMPESTRM128MEM: 10998 return EmitPCMP(MI, BB, 5, true /* in mem */); 10999 11000 // Thread synchronization. 11001 case X86::MONITOR: 11002 return EmitMonitor(MI, BB); 11003 case X86::MWAIT: 11004 return EmitMwait(MI, BB); 11005 11006 // Atomic Lowering. 
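  // Each ATOM* pseudo below is expanded into a compare-exchange retry loop,
  // roughly:
  //   loop: t1 = load [ptr]
  //         t2 = op t1, val      (additionally negated for the *NAND forms)
  //         lcmpxchg [ptr], t2   ; implicitly compares against t1 in AL/AX/EAX/RAX
  //         jne loop
  // which is why each case passes the register/immediate op, load, cmpxchg,
  // and NOT opcodes for its width.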
11007 case X86::ATOMAND32: 11008 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 11009 X86::AND32ri, X86::MOV32rm, 11010 X86::LCMPXCHG32, 11011 X86::NOT32r, X86::EAX, 11012 X86::GR32RegisterClass); 11013 case X86::ATOMOR32: 11014 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 11015 X86::OR32ri, X86::MOV32rm, 11016 X86::LCMPXCHG32, 11017 X86::NOT32r, X86::EAX, 11018 X86::GR32RegisterClass); 11019 case X86::ATOMXOR32: 11020 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 11021 X86::XOR32ri, X86::MOV32rm, 11022 X86::LCMPXCHG32, 11023 X86::NOT32r, X86::EAX, 11024 X86::GR32RegisterClass); 11025 case X86::ATOMNAND32: 11026 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 11027 X86::AND32ri, X86::MOV32rm, 11028 X86::LCMPXCHG32, 11029 X86::NOT32r, X86::EAX, 11030 X86::GR32RegisterClass, true); 11031 case X86::ATOMMIN32: 11032 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 11033 case X86::ATOMMAX32: 11034 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 11035 case X86::ATOMUMIN32: 11036 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 11037 case X86::ATOMUMAX32: 11038 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 11039 11040 case X86::ATOMAND16: 11041 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 11042 X86::AND16ri, X86::MOV16rm, 11043 X86::LCMPXCHG16, 11044 X86::NOT16r, X86::AX, 11045 X86::GR16RegisterClass); 11046 case X86::ATOMOR16: 11047 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 11048 X86::OR16ri, X86::MOV16rm, 11049 X86::LCMPXCHG16, 11050 X86::NOT16r, X86::AX, 11051 X86::GR16RegisterClass); 11052 case X86::ATOMXOR16: 11053 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 11054 X86::XOR16ri, X86::MOV16rm, 11055 X86::LCMPXCHG16, 11056 X86::NOT16r, X86::AX, 11057 X86::GR16RegisterClass); 11058 case X86::ATOMNAND16: 11059 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 11060 X86::AND16ri, X86::MOV16rm, 11061 X86::LCMPXCHG16, 11062 X86::NOT16r, X86::AX, 11063 X86::GR16RegisterClass, true); 11064 case X86::ATOMMIN16: 11065 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 11066 case X86::ATOMMAX16: 11067 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 11068 case X86::ATOMUMIN16: 11069 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 11070 case X86::ATOMUMAX16: 11071 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 11072 11073 case X86::ATOMAND8: 11074 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 11075 X86::AND8ri, X86::MOV8rm, 11076 X86::LCMPXCHG8, 11077 X86::NOT8r, X86::AL, 11078 X86::GR8RegisterClass); 11079 case X86::ATOMOR8: 11080 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 11081 X86::OR8ri, X86::MOV8rm, 11082 X86::LCMPXCHG8, 11083 X86::NOT8r, X86::AL, 11084 X86::GR8RegisterClass); 11085 case X86::ATOMXOR8: 11086 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 11087 X86::XOR8ri, X86::MOV8rm, 11088 X86::LCMPXCHG8, 11089 X86::NOT8r, X86::AL, 11090 X86::GR8RegisterClass); 11091 case X86::ATOMNAND8: 11092 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 11093 X86::AND8ri, X86::MOV8rm, 11094 X86::LCMPXCHG8, 11095 X86::NOT8r, X86::AL, 11096 X86::GR8RegisterClass, true); 11097 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 11098 // This group is for 64-bit host. 
11099 case X86::ATOMAND64: 11100 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 11101 X86::AND64ri32, X86::MOV64rm, 11102 X86::LCMPXCHG64, 11103 X86::NOT64r, X86::RAX, 11104 X86::GR64RegisterClass); 11105 case X86::ATOMOR64: 11106 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 11107 X86::OR64ri32, X86::MOV64rm, 11108 X86::LCMPXCHG64, 11109 X86::NOT64r, X86::RAX, 11110 X86::GR64RegisterClass); 11111 case X86::ATOMXOR64: 11112 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 11113 X86::XOR64ri32, X86::MOV64rm, 11114 X86::LCMPXCHG64, 11115 X86::NOT64r, X86::RAX, 11116 X86::GR64RegisterClass); 11117 case X86::ATOMNAND64: 11118 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 11119 X86::AND64ri32, X86::MOV64rm, 11120 X86::LCMPXCHG64, 11121 X86::NOT64r, X86::RAX, 11122 X86::GR64RegisterClass, true); 11123 case X86::ATOMMIN64: 11124 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 11125 case X86::ATOMMAX64: 11126 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 11127 case X86::ATOMUMIN64: 11128 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 11129 case X86::ATOMUMAX64: 11130 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 11131 11132 // This group does 64-bit operations on a 32-bit host. 11133 case X86::ATOMAND6432: 11134 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11135 X86::AND32rr, X86::AND32rr, 11136 X86::AND32ri, X86::AND32ri, 11137 false); 11138 case X86::ATOMOR6432: 11139 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11140 X86::OR32rr, X86::OR32rr, 11141 X86::OR32ri, X86::OR32ri, 11142 false); 11143 case X86::ATOMXOR6432: 11144 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11145 X86::XOR32rr, X86::XOR32rr, 11146 X86::XOR32ri, X86::XOR32ri, 11147 false); 11148 case X86::ATOMNAND6432: 11149 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11150 X86::AND32rr, X86::AND32rr, 11151 X86::AND32ri, X86::AND32ri, 11152 true); 11153 case X86::ATOMADD6432: 11154 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11155 X86::ADD32rr, X86::ADC32rr, 11156 X86::ADD32ri, X86::ADC32ri, 11157 false); 11158 case X86::ATOMSUB6432: 11159 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11160 X86::SUB32rr, X86::SBB32rr, 11161 X86::SUB32ri, X86::SBB32ri, 11162 false); 11163 case X86::ATOMSWAP6432: 11164 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11165 X86::MOV32rr, X86::MOV32rr, 11166 X86::MOV32ri, X86::MOV32ri, 11167 false); 11168 case X86::VASTART_SAVE_XMM_REGS: 11169 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 11170 11171 case X86::VAARG_64: 11172 return EmitVAARG64WithCustomInserter(MI, BB); 11173 } 11174} 11175 11176//===----------------------------------------------------------------------===// 11177// X86 Optimization Hooks 11178//===----------------------------------------------------------------------===// 11179 11180void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 11181 const APInt &Mask, 11182 APInt &KnownZero, 11183 APInt &KnownOne, 11184 const SelectionDAG &DAG, 11185 unsigned Depth) const { 11186 unsigned Opc = Op.getOpcode(); 11187 assert((Opc >= ISD::BUILTIN_OP_END || 11188 Opc == ISD::INTRINSIC_WO_CHAIN || 11189 Opc == ISD::INTRINSIC_W_CHAIN || 11190 Opc == ISD::INTRINSIC_VOID) && 11191 "Should use MaskedValueIsZero if you don't know whether Op" 11192 " is a target node!"); 11193 11194 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
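  // For the arithmetic nodes below, only the second result (the flag-style
  // boolean) is refined: it is always 0 or 1, so every bit above the low bit
  // is known to be zero.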
11195 switch (Opc) { 11196 default: break; 11197 case X86ISD::ADD: 11198 case X86ISD::SUB: 11199 case X86ISD::ADC: 11200 case X86ISD::SBB: 11201 case X86ISD::SMUL: 11202 case X86ISD::UMUL: 11203 case X86ISD::INC: 11204 case X86ISD::DEC: 11205 case X86ISD::OR: 11206 case X86ISD::XOR: 11207 case X86ISD::AND: 11208 // These nodes' second result is a boolean. 11209 if (Op.getResNo() == 0) 11210 break; 11211 // Fallthrough 11212 case X86ISD::SETCC: 11213 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 11214 Mask.getBitWidth() - 1); 11215 break; 11216 } 11217} 11218 11219unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 11220 unsigned Depth) const { 11221 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 11222 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 11223 return Op.getValueType().getScalarType().getSizeInBits(); 11224 11225 // Fallback case. 11226 return 1; 11227} 11228 11229/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 11230/// node is a GlobalAddress + offset. 11231bool X86TargetLowering::isGAPlusOffset(SDNode *N, 11232 const GlobalValue* &GA, 11233 int64_t &Offset) const { 11234 if (N->getOpcode() == X86ISD::Wrapper) { 11235 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 11236 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 11237 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 11238 return true; 11239 } 11240 } 11241 return TargetLowering::isGAPlusOffset(N, GA, Offset); 11242} 11243 11244/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. 11245static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, 11246 TargetLowering::DAGCombinerInfo &DCI) { 11247 DebugLoc dl = N->getDebugLoc(); 11248 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 11249 SDValue V1 = SVOp->getOperand(0); 11250 SDValue V2 = SVOp->getOperand(1); 11251 EVT VT = SVOp->getValueType(0); 11252 11253 if (V1.getOpcode() == ISD::CONCAT_VECTORS && 11254 V2.getOpcode() == ISD::CONCAT_VECTORS) { 11255 // 11256 // 0,0,0,... 11257 // | 11258 // V UNDEF BUILD_VECTOR UNDEF 11259 // \ / \ / 11260 // CONCAT_VECTOR CONCAT_VECTOR 11261 // \ / 11262 // \ / 11263 // RESULT: V + zero extended 11264 // 11265 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || 11266 V2.getOperand(1).getOpcode() != ISD::UNDEF || 11267 V1.getOperand(1).getOpcode() != ISD::UNDEF) 11268 return SDValue(); 11269 11270 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) 11271 return SDValue(); 11272 11273 // To match the shuffle mask, the first half of the mask should 11274 // be exactly the first vector, and all the rest a splat with the 11275 // first element of the second one. 11276 int NumElems = VT.getVectorNumElements(); 11277 for (int i = 0; i < NumElems/2; ++i) 11278 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || 11279 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) 11280 return SDValue(); 11281 11282 // Emit a zeroed vector and insert the desired subvector on its 11283 // first half. 11284 SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */, DAG, dl); 11285 SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 11286 DAG.getConstant(0, MVT::i32), DAG, dl); 11287 return DCI.CombineTo(N, InsV); 11288 } 11289 11290 return SDValue(); 11291} 11292 11293/// PerformShuffleCombine - Performs several different shuffle combines. 
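/// Currently, 256-bit shuffles are dispatched to PerformShuffleCombine256,
/// and 128-bit shuffles that are equivalent to a build_vector of consecutive
/// loads are merged into a single wide load.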
11294static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
11295 TargetLowering::DAGCombinerInfo &DCI) {
11296 DebugLoc dl = N->getDebugLoc();
11297 EVT VT = N->getValueType(0);
11298
11299 // Don't create instructions with illegal types after legalize types has run.
11300 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11301 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
11302 return SDValue();
11303
11304 // Only handle pure VECTOR_SHUFFLE nodes.
11305 if (VT.getSizeInBits() == 256 && N->getOpcode() == ISD::VECTOR_SHUFFLE)
11306 return PerformShuffleCombine256(N, DAG, DCI);
11307
11308 // Only handle 128-bit wide vectors from here on.
11309 if (VT.getSizeInBits() != 128)
11310 return SDValue();
11311
11312 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
11313 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
11314 // consecutive, non-overlapping, and in the right order.
11315 SmallVector<SDValue, 16> Elts;
11316 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
11317 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
11318
11319 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
11320}
11321
11322/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
11323/// generation and convert it from being a bunch of shuffles and extracts
11324/// to a simple store and scalar loads to extract the elements.
11325static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
11326 const TargetLowering &TLI) {
11327 SDValue InputVector = N->getOperand(0);
11328
11329 // Only operate on vectors of 4 elements, where the alternative shuffling
11330 // gets to be more expensive.
11331 if (InputVector.getValueType() != MVT::v4i32)
11332 return SDValue();
11333
11334 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
11335 // single use which is a sign-extend or zero-extend, and all elements are
11336 // used.
11337 SmallVector<SDNode *, 4> Uses;
11338 unsigned ExtractedElements = 0;
11339 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
11340 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
11341 if (UI.getUse().getResNo() != InputVector.getResNo())
11342 return SDValue();
11343
11344 SDNode *Extract = *UI;
11345 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11346 return SDValue();
11347
11348 if (Extract->getValueType(0) != MVT::i32)
11349 return SDValue();
11350 if (!Extract->hasOneUse())
11351 return SDValue();
11352 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
11353 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
11354 return SDValue();
11355 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
11356 return SDValue();
11357
11358 // Record which element was extracted.
11359 ExtractedElements |=
11360 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
11361
11362 Uses.push_back(Extract);
11363 }
11364
11365 // If not all the elements were used, this may not be worthwhile.
11366 if (ExtractedElements != 15)
11367 return SDValue();
11368
11369 // Ok, we've now decided to do the transformation.
11370 DebugLoc dl = InputVector.getDebugLoc();
11371
11372 // Store the value to a temporary stack slot.
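  // In outline, for v4i32 with all four lanes extracted: spill the vector
  // once, then turn each (ext (extract_vector_elt %v, i)) into a scalar i32
  // load from slot + 4*i.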
11373 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
11374 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
11375 MachinePointerInfo(), false, false, 0);
11376
11377 // Replace each use (extract) with a load of the appropriate element.
11378 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
11379 UE = Uses.end(); UI != UE; ++UI) {
11380 SDNode *Extract = *UI;
11381
11382 // Compute the element's address.
11383 SDValue Idx = Extract->getOperand(1);
11384 unsigned EltSize =
11385 InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
11386 uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
11387 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
11388
11389 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
11390 StackPtr, OffsetVal);
11391
11392 // Load the scalar.
11393 SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
11394 ScalarAddr, MachinePointerInfo(),
11395 false, false, 0);
11396
11397 // Replace the extract with the load.
11398 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
11399 }
11400
11401 // The replacement was made in place; don't return anything.
11402 return SDValue();
11403}
11404
11405/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
11406static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
11407 const X86Subtarget *Subtarget) {
11408 DebugLoc DL = N->getDebugLoc();
11409 SDValue Cond = N->getOperand(0);
11410 // Get the LHS/RHS of the select.
11411 SDValue LHS = N->getOperand(1);
11412 SDValue RHS = N->getOperand(2);
11413
11414 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
11415 // instructions match the semantics of the common C idiom x<y?x:y but not
11416 // x<=y?x:y, because of how they handle negative zero (which can be
11417 // ignored in unsafe-math mode).
11418 if (Subtarget->hasSSE2() &&
11419 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
11420 Cond.getOpcode() == ISD::SETCC) {
11421 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
11422
11423 unsigned Opcode = 0;
11424 // Check for x CC y ? x : y.
11425 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
11426 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
11427 switch (CC) {
11428 default: break;
11429 case ISD::SETULT:
11430 // Converting this to a min would handle NaNs incorrectly, and swapping
11431 // the operands would cause it to handle comparisons between positive
11432 // and negative zero incorrectly.
11433 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
11434 if (!UnsafeFPMath &&
11435 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
11436 break;
11437 std::swap(LHS, RHS);
11438 }
11439 Opcode = X86ISD::FMIN;
11440 break;
11441 case ISD::SETOLE:
11442 // Converting this to a min would handle comparisons between positive
11443 // and negative zero incorrectly.
11444 if (!UnsafeFPMath &&
11445 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
11446 break;
11447 Opcode = X86ISD::FMIN;
11448 break;
11449 case ISD::SETULE:
11450 // Converting this to a min would handle both negative zeros and NaNs
11451 // incorrectly, but we can swap the operands to fix both.
11452 std::swap(LHS, RHS); // FALL THROUGH.
11453 case ISD::SETOLT:
11454 case ISD::SETLT:
11455 case ISD::SETLE:
11456 Opcode = X86ISD::FMIN;
11457 break;
11458
11459 case ISD::SETOGE:
11460 // Converting this to a max would handle comparisons between positive
11461 // and negative zero incorrectly.
11462 if (!UnsafeFPMath &&
11463 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
11464 break;
11465 Opcode = X86ISD::FMAX;
11466 break;
11467 case ISD::SETUGT:
11468 // Converting this to a max would handle NaNs incorrectly, and swapping
11469 // the operands would cause it to handle comparisons between positive
11470 // and negative zero incorrectly.
11471 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
11472 if (!UnsafeFPMath &&
11473 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
11474 break;
11475 std::swap(LHS, RHS);
11476 }
11477 Opcode = X86ISD::FMAX;
11478 break;
11479 case ISD::SETUGE:
11480 // Converting this to a max would handle both negative zeros and NaNs
11481 // incorrectly, but we can swap the operands to fix both.
11482 std::swap(LHS, RHS); // FALL THROUGH.
11483 case ISD::SETOGT:
11484 case ISD::SETGT:
11485 case ISD::SETGE:
11486 Opcode = X86ISD::FMAX;
11487 break;
11488 }
11489 // Check for x CC y ? y : x -- a min/max with reversed arms.
11490 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
11491 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
11492 switch (CC) {
11493 default: break;
11494 case ISD::SETOGE:
11495 // Converting this to a min would handle comparisons between positive
11496 // and negative zero incorrectly, and swapping the operands would
11497 // cause it to handle NaNs incorrectly.
11498 if (!UnsafeFPMath &&
11499 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
11500 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
11501 break;
11502 std::swap(LHS, RHS);
11503 }
11504 Opcode = X86ISD::FMIN;
11505 break;
11506 case ISD::SETUGT:
11507 // Converting this to a min would handle NaNs incorrectly.
11508 if (!UnsafeFPMath &&
11509 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
11510 break;
11511 Opcode = X86ISD::FMIN;
11512 break;
11513 case ISD::SETUGE:
11514 // Converting this to a min would handle both negative zeros and NaNs
11515 // incorrectly, but we can swap the operands to fix both.
11516 std::swap(LHS, RHS); // FALL THROUGH.
11517 case ISD::SETOGT:
11518 case ISD::SETGT:
11519 case ISD::SETGE:
11520 Opcode = X86ISD::FMIN;
11521 break;
11522
11523 case ISD::SETULT:
11524 // Converting this to a max would handle NaNs incorrectly.
11525 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
11526 break;
11527 Opcode = X86ISD::FMAX;
11528 break;
11529 case ISD::SETOLE:
11530 // Converting this to a max would handle comparisons between positive
11531 // and negative zero incorrectly, and swapping the operands would
11532 // cause it to handle NaNs incorrectly.
11533 if (!UnsafeFPMath &&
11534 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
11535 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
11536 break;
11537 std::swap(LHS, RHS);
11538 }
11539 Opcode = X86ISD::FMAX;
11540 break;
11541 case ISD::SETULE:
11542 // Converting this to a max would handle both negative zeros and NaNs
11543 // incorrectly, but we can swap the operands to fix both.
11544 std::swap(LHS, RHS); // FALL THROUGH.
11545 case ISD::SETOLT:
11546 case ISD::SETLT:
11547 case ISD::SETLE:
11548 Opcode = X86ISD::FMAX;
11549 break;
11550 }
11551 }
11552
11553 if (Opcode)
11554 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
11555 }
11556
11557 // If this is a select between two integer constants, try to do some
11558 // optimizations.
11559 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
11560 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
11561 // Don't do this for crazy integer types.
11562 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
11563 // If this is efficiently invertible, canonicalize the LHSC/RHSC values
11564 // so that TrueC (the true value) is larger than FalseC.
11565 bool NeedsCondInvert = false;
11566
11567 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
11568 // Efficiently invertible.
11569 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
11570 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
11571 isa<ConstantSDNode>(Cond.getOperand(1))))) {
11572 NeedsCondInvert = true;
11573 std::swap(TrueC, FalseC);
11574 }
11575
11576 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
11577 if (FalseC->getAPIntValue() == 0 &&
11578 TrueC->getAPIntValue().isPowerOf2()) {
11579 if (NeedsCondInvert) // Invert the condition if needed.
11580 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
11581 DAG.getConstant(1, Cond.getValueType()));
11582
11583 // Zero extend the condition if needed.
11584 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
11585
11586 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
11587 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
11588 DAG.getConstant(ShAmt, MVT::i8));
11589 }
11590
11591 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
11592 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
11593 if (NeedsCondInvert) // Invert the condition if needed.
11594 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
11595 DAG.getConstant(1, Cond.getValueType()));
11596
11597 // Zero extend the condition if needed.
11598 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
11599 FalseC->getValueType(0), Cond);
11600 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
11601 SDValue(FalseC, 0));
11602 }
11603
11604 // Optimize cases that will turn into an LEA instruction. This requires
11605 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
11606 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
11607 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
11608 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
11609
11610 bool isFastMultiplier = false;
11611 if (Diff < 10) {
11612 switch ((unsigned char)Diff) {
11613 default: break;
11614 case 1: // result = add base, cond
11615 case 2: // result = lea base( , cond*2)
11616 case 3: // result = lea base(cond, cond*2)
11617 case 4: // result = lea base( , cond*4)
11618 case 5: // result = lea base(cond, cond*4)
11619 case 8: // result = lea base( , cond*8)
11620 case 9: // result = lea base(cond, cond*8)
11621 isFastMultiplier = true;
11622 break;
11623 }
11624 }
11625
11626 if (isFastMultiplier) {
11627 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
11628 if (NeedsCondInvert) // Invert the condition if needed.
11629 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
11630 DAG.getConstant(1, Cond.getValueType()));
11631
11632 // Zero extend the condition if needed.
11633 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
11634 Cond);
11635 // Scale the condition by the difference.
11636 if (Diff != 1)
11637 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
11638 DAG.getConstant(Diff, Cond.getValueType()));
11639
11640 // Add the base if non-zero.
11641 if (FalseC->getAPIntValue() != 0)
11642 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
11643 SDValue(FalseC, 0));
11644 return Cond;
11645 }
11646 }
11647 }
11648 }
11649
11650 return SDValue();
11651}
11652
11653/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
11654static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
11655 TargetLowering::DAGCombinerInfo &DCI) {
11656 DebugLoc DL = N->getDebugLoc();
11657
11658 // If the flag operand isn't dead, don't touch this CMOV.
11659 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
11660 return SDValue();
11661
11662 SDValue FalseOp = N->getOperand(0);
11663 SDValue TrueOp = N->getOperand(1);
11664 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
11665 SDValue Cond = N->getOperand(3);
11666 if (CC == X86::COND_E || CC == X86::COND_NE) {
11667 switch (Cond.getOpcode()) {
11668 default: break;
11669 case X86ISD::BSR:
11670 case X86ISD::BSF:
11671 // If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
11672 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
11673 return (CC == X86::COND_E) ? FalseOp : TrueOp;
11674 }
11675 }
11676
11677 // If this is a select between two integer constants, try to do some
11678 // optimizations. Note that the operands are ordered the opposite of SELECT
11679 // operands.
11680 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
11681 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
11682 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
11683 // larger than FalseC (the false value).
11684 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
11685 CC = X86::GetOppositeBranchCondition(CC);
11686 std::swap(TrueC, FalseC);
11687 }
11688
11689 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
11690 // This is efficient for any integer data type (including i8/i16) and
11691 // shift amount.
11692 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
11693 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
11694 DAG.getConstant(CC, MVT::i8), Cond);
11695
11696 // Zero extend the condition if needed.
11697 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
11698
11699 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
11700 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
11701 DAG.getConstant(ShAmt, MVT::i8));
11702 if (N->getNumValues() == 2) // Dead flag value?
11703 return DCI.CombineTo(N, Cond, SDValue());
11704 return Cond;
11705 }
11706
11707 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
11708 // for any integer data type, including i8/i16.
11709 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
11710 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
11711 DAG.getConstant(CC, MVT::i8), Cond);
11712
11713 // Zero extend the condition if needed.
11714 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
11715 FalseC->getValueType(0), Cond);
11716 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
11717 SDValue(FalseC, 0));
11718
11719 if (N->getNumValues() == 2) // Dead flag value?
11720 return DCI.CombineTo(N, Cond, SDValue());
11721 return Cond;
11722 }
11723
11724 // Optimize cases that will turn into an LEA instruction. This requires
11725 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
11726 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
11727 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
11728 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
11729
11730 bool isFastMultiplier = false;
11731 if (Diff < 10) {
11732 switch ((unsigned char)Diff) {
11733 default: break;
11734 case 1: // result = add base, cond
11735 case 2: // result = lea base( , cond*2)
11736 case 3: // result = lea base(cond, cond*2)
11737 case 4: // result = lea base( , cond*4)
11738 case 5: // result = lea base(cond, cond*4)
11739 case 8: // result = lea base( , cond*8)
11740 case 9: // result = lea base(cond, cond*8)
11741 isFastMultiplier = true;
11742 break;
11743 }
11744 }
11745
11746 if (isFastMultiplier) {
11747 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
11748 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
11749 DAG.getConstant(CC, MVT::i8), Cond);
11750 // Zero extend the condition if needed.
11751 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
11752 Cond);
11753 // Scale the condition by the difference.
11754 if (Diff != 1)
11755 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
11756 DAG.getConstant(Diff, Cond.getValueType()));
11757
11758 // Add the base if non-zero.
11759 if (FalseC->getAPIntValue() != 0)
11760 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
11761 SDValue(FalseC, 0));
11762 if (N->getNumValues() == 2) // Dead flag value?
11763 return DCI.CombineTo(N, Cond, SDValue());
11764 return Cond;
11765 }
11766 }
11767 }
11768 }
11769 return SDValue();
11770}
11771
11772
11773/// PerformMulCombine - Optimize a single multiply with constant into two
11774/// in order to implement it with two cheaper instructions, e.g.
11775/// LEA + SHL, LEA + LEA.
11776static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
11777 TargetLowering::DAGCombinerInfo &DCI) {
11778 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
11779 return SDValue();
11780
11781 EVT VT = N->getValueType(0);
11782 if (VT != MVT::i64)
11783 return SDValue();
11784
11785 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
11786 if (!C)
11787 return SDValue();
11788 uint64_t MulAmt = C->getZExtValue();
11789 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
11790 return SDValue();
11791
11792 uint64_t MulAmt1 = 0;
11793 uint64_t MulAmt2 = 0;
11794 if ((MulAmt % 9) == 0) {
11795 MulAmt1 = 9;
11796 MulAmt2 = MulAmt / 9;
11797 } else if ((MulAmt % 5) == 0) {
11798 MulAmt1 = 5;
11799 MulAmt2 = MulAmt / 5;
11800 } else if ((MulAmt % 3) == 0) {
11801 MulAmt1 = 3;
11802 MulAmt2 = MulAmt / 3;
11803 }
11804 if (MulAmt2 &&
11805 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
11806 DebugLoc DL = N->getDebugLoc();
11807
11808 if (isPowerOf2_64(MulAmt2) &&
11809 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
11810 // If second multiplier is pow2, issue it first. We want the multiply by
11811 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
11812 // is an add.
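      // e.g. for x*40 = (x*5)*8, emit the shift (x << 3) first so that the
      // remaining multiply-by-5 can become a single LEA.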
11813 std::swap(MulAmt1, MulAmt2);
11814
11815 SDValue NewMul;
11816 if (isPowerOf2_64(MulAmt1))
11817 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
11818 DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
11819 else
11820 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
11821 DAG.getConstant(MulAmt1, VT));
11822
11823 if (isPowerOf2_64(MulAmt2))
11824 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
11825 DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
11826 else
11827 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
11828 DAG.getConstant(MulAmt2, VT));
11829
11830 // Do not add new nodes to DAG combiner worklist.
11831 DCI.CombineTo(N, NewMul, false);
11832 }
11833 return SDValue();
11834}
11835
11836static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
11837 SDValue N0 = N->getOperand(0);
11838 SDValue N1 = N->getOperand(1);
11839 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
11840 EVT VT = N0.getValueType();
11841
11842 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
11843 // since the result of setcc_c is all zeros or all ones.
11844 if (N1C && N0.getOpcode() == ISD::AND &&
11845 N0.getOperand(1).getOpcode() == ISD::Constant) {
11846 SDValue N00 = N0.getOperand(0);
11847 if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
11848 ((N00.getOpcode() == ISD::ANY_EXTEND ||
11849 N00.getOpcode() == ISD::ZERO_EXTEND) &&
11850 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
11851 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
11852 APInt ShAmt = N1C->getAPIntValue();
11853 Mask = Mask.shl(ShAmt);
11854 if (Mask != 0)
11855 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
11856 N00, DAG.getConstant(Mask, VT));
11857 }
11858 }
11859
11860 return SDValue();
11861}
11862
11863/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
11864/// when possible.
11865static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
11866 const X86Subtarget *Subtarget) {
11867 EVT VT = N->getValueType(0);
11868 if (!VT.isVector() && VT.isInteger() &&
11869 N->getOpcode() == ISD::SHL)
11870 return PerformSHLCombine(N, DAG);
11871
11872 // On X86 with SSE2 support, we can transform this to a vector shift if
11873 // all elements are shifted by the same amount. We can't do this in legalize
11874 // because a constant vector is typically transformed to a constant pool
11875 // so we have no knowledge of the shift amount.
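  // e.g. (shl <4 x i32> %x, <5, 5, 5, 5>) becomes a single pslld-by-5,
  // emitted below as an x86_sse2_pslli_d intrinsic node.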
11876 if (!Subtarget->hasSSE2()) 11877 return SDValue(); 11878 11879 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 11880 return SDValue(); 11881 11882 SDValue ShAmtOp = N->getOperand(1); 11883 EVT EltVT = VT.getVectorElementType(); 11884 DebugLoc DL = N->getDebugLoc(); 11885 SDValue BaseShAmt = SDValue(); 11886 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 11887 unsigned NumElts = VT.getVectorNumElements(); 11888 unsigned i = 0; 11889 for (; i != NumElts; ++i) { 11890 SDValue Arg = ShAmtOp.getOperand(i); 11891 if (Arg.getOpcode() == ISD::UNDEF) continue; 11892 BaseShAmt = Arg; 11893 break; 11894 } 11895 for (; i != NumElts; ++i) { 11896 SDValue Arg = ShAmtOp.getOperand(i); 11897 if (Arg.getOpcode() == ISD::UNDEF) continue; 11898 if (Arg != BaseShAmt) { 11899 return SDValue(); 11900 } 11901 } 11902 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 11903 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 11904 SDValue InVec = ShAmtOp.getOperand(0); 11905 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 11906 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 11907 unsigned i = 0; 11908 for (; i != NumElts; ++i) { 11909 SDValue Arg = InVec.getOperand(i); 11910 if (Arg.getOpcode() == ISD::UNDEF) continue; 11911 BaseShAmt = Arg; 11912 break; 11913 } 11914 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 11915 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 11916 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 11917 if (C->getZExtValue() == SplatIdx) 11918 BaseShAmt = InVec.getOperand(1); 11919 } 11920 } 11921 if (BaseShAmt.getNode() == 0) 11922 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 11923 DAG.getIntPtrConstant(0)); 11924 } else 11925 return SDValue(); 11926 11927 // The shift amount is an i32. 11928 if (EltVT.bitsGT(MVT::i32)) 11929 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 11930 else if (EltVT.bitsLT(MVT::i32)) 11931 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 11932 11933 // The shift amount is identical so we can do a vector shift. 
11934 SDValue ValOp = N->getOperand(0); 11935 switch (N->getOpcode()) { 11936 default: 11937 llvm_unreachable("Unknown shift opcode!"); 11938 break; 11939 case ISD::SHL: 11940 if (VT == MVT::v2i64) 11941 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11942 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 11943 ValOp, BaseShAmt); 11944 if (VT == MVT::v4i32) 11945 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11946 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 11947 ValOp, BaseShAmt); 11948 if (VT == MVT::v8i16) 11949 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11950 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 11951 ValOp, BaseShAmt); 11952 break; 11953 case ISD::SRA: 11954 if (VT == MVT::v4i32) 11955 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11956 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 11957 ValOp, BaseShAmt); 11958 if (VT == MVT::v8i16) 11959 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11960 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 11961 ValOp, BaseShAmt); 11962 break; 11963 case ISD::SRL: 11964 if (VT == MVT::v2i64) 11965 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11966 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 11967 ValOp, BaseShAmt); 11968 if (VT == MVT::v4i32) 11969 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11970 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 11971 ValOp, BaseShAmt); 11972 if (VT == MVT::v8i16) 11973 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11974 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 11975 ValOp, BaseShAmt); 11976 break; 11977 } 11978 return SDValue(); 11979} 11980 11981 11982// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) 11983// where both setccs reference the same FP CMP, and rewrite for CMPEQSS 11984// and friends. Likewise for OR -> CMPNEQSS. 11985static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, 11986 TargetLowering::DAGCombinerInfo &DCI, 11987 const X86Subtarget *Subtarget) { 11988 unsigned opcode; 11989 11990 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but 11991 // we're requiring SSE2 for both. 11992 if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { 11993 SDValue N0 = N->getOperand(0); 11994 SDValue N1 = N->getOperand(1); 11995 SDValue CMP0 = N0->getOperand(1); 11996 SDValue CMP1 = N1->getOperand(1); 11997 DebugLoc DL = N->getDebugLoc(); 11998 11999 // The SETCCs should both refer to the same CMP. 
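  // e.g. for an ordered-equal f32 compare, legalization produces
  //   (and (setcc COND_E, (cmp x, y)), (setcc COND_NP, (cmp x, y)))
  // which is rewritten below into one FSETCC (cmpeqss-style) whose result is
  // masked down to a single bit.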
12000 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
12001 return SDValue();
12002
12003 SDValue CMP00 = CMP0->getOperand(0);
12004 SDValue CMP01 = CMP0->getOperand(1);
12005 EVT VT = CMP00.getValueType();
12006
12007 if (VT == MVT::f32 || VT == MVT::f64) {
12008 bool ExpectingFlags = false;
12009 // Check for any users that want flags:
12010 for (SDNode::use_iterator UI = N->use_begin(),
12011 UE = N->use_end();
12012 !ExpectingFlags && UI != UE; ++UI)
12013 switch (UI->getOpcode()) {
12014 default:
12015 case ISD::BR_CC:
12016 case ISD::BRCOND:
12017 case ISD::SELECT:
12018 ExpectingFlags = true;
12019 break;
12020 case ISD::CopyToReg:
12021 case ISD::SIGN_EXTEND:
12022 case ISD::ZERO_EXTEND:
12023 case ISD::ANY_EXTEND:
12024 break;
12025 }
12026
12027 if (!ExpectingFlags) {
12028 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
12029 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
12030
12031 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
12032 X86::CondCode tmp = cc0;
12033 cc0 = cc1;
12034 cc1 = tmp;
12035 }
12036
12037 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
12038 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
12039 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
12040 X86ISD::NodeType NTOperator = is64BitFP ?
12041 X86ISD::FSETCCsd : X86ISD::FSETCCss;
12042 // FIXME: need symbolic constants for these magic numbers.
12043 // See X86ATTInstPrinter.cpp:printSSECC().
12044 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
12045 SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
12046 DAG.getConstant(x86cc, MVT::i8));
12047 SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
12048 OnesOrZeroesF);
12049 SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
12050 DAG.getConstant(1, MVT::i32));
12051 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
12052 return OneBitOfTruth;
12053 }
12054 }
12055 }
12056 }
12057 return SDValue();
12058}
12059
12060/// CanFoldXORWithAllOnes - Test whether the XOR operand is an AllOnes vector
12061/// so it can be folded inside ANDNP.
12062static bool CanFoldXORWithAllOnes(const SDNode *N) {
12063 EVT VT = N->getValueType(0);
12064
12065 // Match direct AllOnes for 128 and 256-bit vectors
12066 if (ISD::isBuildVectorAllOnes(N))
12067 return true;
12068
12069 // Look through a bit convert.
12070 if (N->getOpcode() == ISD::BITCAST)
12071 N = N->getOperand(0).getNode();
12072
12073 // Sometimes the operand may come from an insert_subvector building a 256-bit
12074 // allones vector.
12075 SDValue V1 = N->getOperand(0);
12076 SDValue V2 = N->getOperand(1);
12077
12078 if (VT.getSizeInBits() == 256 &&
12079 N->getOpcode() == ISD::INSERT_SUBVECTOR &&
12080 V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
12081 V1.getOperand(0).getOpcode() == ISD::UNDEF &&
12082 ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
12083 ISD::isBuildVectorAllOnes(V2.getNode()))
12084 return true;
12085
12086 return false;
12087}
12088
12089static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
12090 TargetLowering::DAGCombinerInfo &DCI,
12091 const X86Subtarget *Subtarget) {
12092 if (DCI.isBeforeLegalizeOps())
12093 return SDValue();
12094
12095 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
12096 if (R.getNode())
12097 return R;
12098
12099 // Want to form ANDNP nodes:
12100 // 1) In the hopes of then easily combining them with OR and AND nodes
12101 // to form PBLEND/PSIGN.
12102 // 2) To match ANDN packed intrinsics
12103 EVT VT = N->getValueType(0);
12104 if (VT != MVT::v2i64 && VT != MVT::v4i64)
12105 return SDValue();
12106
12107 SDValue N0 = N->getOperand(0);
12108 SDValue N1 = N->getOperand(1);
12109 DebugLoc DL = N->getDebugLoc();
12110
12111 // Check LHS for vnot
12112 if (N0.getOpcode() == ISD::XOR &&
12113 //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
12114 CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
12115 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
12116
12117 // Check RHS for vnot
12118 if (N1.getOpcode() == ISD::XOR &&
12119 //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
12120 CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
12121 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
12122
12123 return SDValue();
12124}
12125
12126static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
12127 TargetLowering::DAGCombinerInfo &DCI,
12128 const X86Subtarget *Subtarget) {
12129 if (DCI.isBeforeLegalizeOps())
12130 return SDValue();
12131
12132 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
12133 if (R.getNode())
12134 return R;
12135
12136 EVT VT = N->getValueType(0);
12137 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64)
12138 return SDValue();
12139
12140 SDValue N0 = N->getOperand(0);
12141 SDValue N1 = N->getOperand(1);
12142
12143 // look for psign/blend
12144 if (Subtarget->hasSSSE3()) {
12145 if (VT == MVT::v2i64) {
12146 // Canonicalize pandn to RHS
12147 if (N0.getOpcode() == X86ISD::ANDNP)
12148 std::swap(N0, N1);
12149 // or (and (m, x), (pandn m, y))
12150 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
12151 SDValue Mask = N1.getOperand(0);
12152 SDValue X = N1.getOperand(1);
12153 SDValue Y;
12154 if (N0.getOperand(0) == Mask)
12155 Y = N0.getOperand(1);
12156 if (N0.getOperand(1) == Mask)
12157 Y = N0.getOperand(0);
12158
12159 // Check to see if the mask appeared in both the AND and ANDNP.
12160 if (!Y.getNode())
12161 return SDValue();
12162
12163 // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
12164 if (Mask.getOpcode() != ISD::BITCAST ||
12165 X.getOpcode() != ISD::BITCAST ||
12166 Y.getOpcode() != ISD::BITCAST)
12167 return SDValue();
12168
12169 // Look through mask bitcast.
12170 Mask = Mask.getOperand(0);
12171 EVT MaskVT = Mask.getValueType();
12172
12173 // Validate that the Mask operand is a vector sra node. The sra node
12174 // will be an intrinsic.
12175 if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
12176 return SDValue();
12177
12178 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
12179 // there is no psrai.b
12180 switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) {
12181 case Intrinsic::x86_sse2_psrai_w:
12182 case Intrinsic::x86_sse2_psrai_d:
12183 break;
12184 default: return SDValue();
12185 }
12186
12187 // Check that the SRA is all signbits.
12188 SDValue SraC = Mask.getOperand(2);
12189 unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
12190 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
12191 if ((SraAmt + 1) != EltBits)
12192 return SDValue();
12193
12194 DebugLoc DL = N->getDebugLoc();
12195
12196 // Now we know we at least have a pblendvb with the mask val. See if
12197 // we can form a psignb/w/d.
12198 // psign = x.type == y.type == mask.type && y = sub(0, x); 12199 X = X.getOperand(0); 12200 Y = Y.getOperand(0); 12201 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && 12202 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && 12203 X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){ 12204 unsigned Opc = 0; 12205 switch (EltBits) { 12206 case 8: Opc = X86ISD::PSIGNB; break; 12207 case 16: Opc = X86ISD::PSIGNW; break; 12208 case 32: Opc = X86ISD::PSIGND; break; 12209 default: break; 12210 } 12211 if (Opc) { 12212 SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1)); 12213 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign); 12214 } 12215 } 12216 // PBLENDVB only available on SSE 4.1 12217 if (!Subtarget->hasSSE41()) 12218 return SDValue(); 12219 12220 X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X); 12221 Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y); 12222 Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask); 12223 Mask = DAG.getNode(X86ISD::PBLENDVB, DL, MVT::v16i8, X, Y, Mask); 12224 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask); 12225 } 12226 } 12227 } 12228 12229 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 12230 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 12231 std::swap(N0, N1); 12232 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 12233 return SDValue(); 12234 if (!N0.hasOneUse() || !N1.hasOneUse()) 12235 return SDValue(); 12236 12237 SDValue ShAmt0 = N0.getOperand(1); 12238 if (ShAmt0.getValueType() != MVT::i8) 12239 return SDValue(); 12240 SDValue ShAmt1 = N1.getOperand(1); 12241 if (ShAmt1.getValueType() != MVT::i8) 12242 return SDValue(); 12243 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 12244 ShAmt0 = ShAmt0.getOperand(0); 12245 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 12246 ShAmt1 = ShAmt1.getOperand(0); 12247 12248 DebugLoc DL = N->getDebugLoc(); 12249 unsigned Opc = X86ISD::SHLD; 12250 SDValue Op0 = N0.getOperand(0); 12251 SDValue Op1 = N1.getOperand(0); 12252 if (ShAmt0.getOpcode() == ISD::SUB) { 12253 Opc = X86ISD::SHRD; 12254 std::swap(Op0, Op1); 12255 std::swap(ShAmt0, ShAmt1); 12256 } 12257 12258 unsigned Bits = VT.getSizeInBits(); 12259 if (ShAmt1.getOpcode() == ISD::SUB) { 12260 SDValue Sum = ShAmt1.getOperand(0); 12261 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 12262 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 12263 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 12264 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 12265 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 12266 return DAG.getNode(Opc, DL, VT, 12267 Op0, Op1, 12268 DAG.getNode(ISD::TRUNCATE, DL, 12269 MVT::i8, ShAmt0)); 12270 } 12271 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 12272 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 12273 if (ShAmt0C && 12274 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 12275 return DAG.getNode(Opc, DL, VT, 12276 N0.getOperand(0), N1.getOperand(0), 12277 DAG.getNode(ISD::TRUNCATE, DL, 12278 MVT::i8, ShAmt0)); 12279 } 12280 12281 return SDValue(); 12282} 12283 12284/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 12285static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 12286 const X86Subtarget *Subtarget) { 12287 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 12288 // the FP state in cases where an emms may be missing. 
12289 // A preferable solution to the general problem is to figure out the right 12290 // places to insert EMMS. This qualifies as a quick hack. 12291 12292 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 12293 StoreSDNode *St = cast<StoreSDNode>(N); 12294 EVT VT = St->getValue().getValueType(); 12295 if (VT.getSizeInBits() != 64) 12296 return SDValue(); 12297 12298 const Function *F = DAG.getMachineFunction().getFunction(); 12299 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 12300 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 12301 && Subtarget->hasSSE2(); 12302 if ((VT.isVector() || 12303 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 12304 isa<LoadSDNode>(St->getValue()) && 12305 !cast<LoadSDNode>(St->getValue())->isVolatile() && 12306 St->getChain().hasOneUse() && !St->isVolatile()) { 12307 SDNode* LdVal = St->getValue().getNode(); 12308 LoadSDNode *Ld = 0; 12309 int TokenFactorIndex = -1; 12310 SmallVector<SDValue, 8> Ops; 12311 SDNode* ChainVal = St->getChain().getNode(); 12312 // Must be a store of a load. We currently handle two cases: the load 12313 // is a direct child, and it's under an intervening TokenFactor. It is 12314 // possible to dig deeper under nested TokenFactors. 12315 if (ChainVal == LdVal) 12316 Ld = cast<LoadSDNode>(St->getChain()); 12317 else if (St->getValue().hasOneUse() && 12318 ChainVal->getOpcode() == ISD::TokenFactor) { 12319 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 12320 if (ChainVal->getOperand(i).getNode() == LdVal) { 12321 TokenFactorIndex = i; 12322 Ld = cast<LoadSDNode>(St->getValue()); 12323 } else 12324 Ops.push_back(ChainVal->getOperand(i)); 12325 } 12326 } 12327 12328 if (!Ld || !ISD::isNormalLoad(Ld)) 12329 return SDValue(); 12330 12331 // If this is not the MMX case, i.e. we are just turning i64 load/store 12332 // into f64 load/store, avoid the transformation if there are multiple 12333 // uses of the loaded value. 12334 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 12335 return SDValue(); 12336 12337 DebugLoc LdDL = Ld->getDebugLoc(); 12338 DebugLoc StDL = N->getDebugLoc(); 12339 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 12340 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 12341 // pair instead. 12342 if (Subtarget->is64Bit() || F64IsLegal) { 12343 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 12344 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 12345 Ld->getPointerInfo(), Ld->isVolatile(), 12346 Ld->isNonTemporal(), Ld->getAlignment()); 12347 SDValue NewChain = NewLd.getValue(1); 12348 if (TokenFactorIndex != -1) { 12349 Ops.push_back(NewChain); 12350 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 12351 Ops.size()); 12352 } 12353 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 12354 St->getPointerInfo(), 12355 St->isVolatile(), St->isNonTemporal(), 12356 St->getAlignment()); 12357 } 12358 12359 // Otherwise, lower to two pairs of 32-bit loads / stores. 
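    // Illustrative sketch of the expansion below:
    //   lo = load i32 Ptr            hi = load i32 Ptr+4
    //   store i32 lo, StPtr          store i32 hi, StPtr+4
    // The +4 halves can only claim MinAlign(OrigAlign, 4), since Ptr+4 need
    // not share the original pointer's alignment.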
12360 SDValue LoAddr = Ld->getBasePtr(); 12361 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 12362 DAG.getConstant(4, MVT::i32)); 12363 12364 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 12365 Ld->getPointerInfo(), 12366 Ld->isVolatile(), Ld->isNonTemporal(), 12367 Ld->getAlignment()); 12368 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 12369 Ld->getPointerInfo().getWithOffset(4), 12370 Ld->isVolatile(), Ld->isNonTemporal(), 12371 MinAlign(Ld->getAlignment(), 4)); 12372 12373 SDValue NewChain = LoLd.getValue(1); 12374 if (TokenFactorIndex != -1) { 12375 Ops.push_back(LoLd); 12376 Ops.push_back(HiLd); 12377 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 12378 Ops.size()); 12379 } 12380 12381 LoAddr = St->getBasePtr(); 12382 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 12383 DAG.getConstant(4, MVT::i32)); 12384 12385 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 12386 St->getPointerInfo(), 12387 St->isVolatile(), St->isNonTemporal(), 12388 St->getAlignment()); 12389 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 12390 St->getPointerInfo().getWithOffset(4), 12391 St->isVolatile(), 12392 St->isNonTemporal(), 12393 MinAlign(St->getAlignment(), 4)); 12394 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 12395 } 12396 return SDValue(); 12397} 12398 12399/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 12400/// X86ISD::FXOR nodes. 12401static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 12402 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 12403 // F[X]OR(0.0, x) -> x 12404 // F[X]OR(x, 0.0) -> x 12405 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 12406 if (C->getValueAPF().isPosZero()) 12407 return N->getOperand(1); 12408 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 12409 if (C->getValueAPF().isPosZero()) 12410 return N->getOperand(0); 12411 return SDValue(); 12412} 12413 12414/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 12415static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 12416 // FAND(0.0, x) -> 0.0 12417 // FAND(x, 0.0) -> 0.0 12418 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 12419 if (C->getValueAPF().isPosZero()) 12420 return N->getOperand(0); 12421 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 12422 if (C->getValueAPF().isPosZero()) 12423 return N->getOperand(1); 12424 return SDValue(); 12425} 12426 12427static SDValue PerformBTCombine(SDNode *N, 12428 SelectionDAG &DAG, 12429 TargetLowering::DAGCombinerInfo &DCI) { 12430 // BT ignores high bits in the bit index operand. 
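  // For example, a 32-bit BT uses only bits [4:0] of a register bit index
  // (a 64-bit BT uses [5:0]), so an explicit "and $31, %idx" feeding the
  // index is redundant; narrowing the demanded bits below lets such masking
  // fold away.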
12431 SDValue Op1 = N->getOperand(1); 12432 if (Op1.hasOneUse()) { 12433 unsigned BitWidth = Op1.getValueSizeInBits(); 12434 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 12435 APInt KnownZero, KnownOne; 12436 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 12437 !DCI.isBeforeLegalizeOps()); 12438 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12439 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 12440 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 12441 DCI.CommitTargetLoweringOpt(TLO); 12442 } 12443 return SDValue(); 12444} 12445 12446static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 12447 SDValue Op = N->getOperand(0); 12448 if (Op.getOpcode() == ISD::BITCAST) 12449 Op = Op.getOperand(0); 12450 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 12451 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 12452 VT.getVectorElementType().getSizeInBits() == 12453 OpVT.getVectorElementType().getSizeInBits()) { 12454 return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); 12455 } 12456 return SDValue(); 12457} 12458 12459static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 12460 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 12461 // (and (i32 x86isd::setcc_carry), 1) 12462 // This eliminates the zext. This transformation is necessary because 12463 // ISD::SETCC is always legalized to i8. 12464 DebugLoc dl = N->getDebugLoc(); 12465 SDValue N0 = N->getOperand(0); 12466 EVT VT = N->getValueType(0); 12467 if (N0.getOpcode() == ISD::AND && 12468 N0.hasOneUse() && 12469 N0.getOperand(0).hasOneUse()) { 12470 SDValue N00 = N0.getOperand(0); 12471 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 12472 return SDValue(); 12473 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 12474 if (!C || C->getZExtValue() != 1) 12475 return SDValue(); 12476 return DAG.getNode(ISD::AND, dl, VT, 12477 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 12478 N00.getOperand(0), N00.getOperand(1)), 12479 DAG.getConstant(1, VT)); 12480 } 12481 12482 return SDValue(); 12483} 12484 12485// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 12486static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { 12487 unsigned X86CC = N->getConstantOperandVal(0); 12488 SDValue EFLAG = N->getOperand(1); 12489 DebugLoc DL = N->getDebugLoc(); 12490 12491 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 12492 // a zext and produces an all-ones bit which is more useful than 0/1 in some 12493 // cases. 12494 if (X86CC == X86::COND_B) 12495 return DAG.getNode(ISD::AND, DL, MVT::i8, 12496 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 12497 DAG.getConstant(X86CC, MVT::i8), EFLAG), 12498 DAG.getConstant(1, MVT::i8)); 12499 12500 return SDValue(); 12501} 12502 12503static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, 12504 const X86TargetLowering *XTLI) { 12505 SDValue Op0 = N->getOperand(0); 12506 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have 12507 // a 32-bit target where SSE doesn't support i64->FP operations. 
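  // Sketch: on i386, "sitofp (i64 load %p) to double" can become a single x87
  // fild of the 64-bit memory operand, reading %p directly; BuildFILD creates
  // the FILD node and the original load is bypassed by rewiring its chain
  // result below.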
12508 if (Op0.getOpcode() == ISD::LOAD) { 12509 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); 12510 EVT VT = Ld->getValueType(0); 12511 if (!Ld->isVolatile() && !N->getValueType(0).isVector() && 12512 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && 12513 !XTLI->getSubtarget()->is64Bit() && 12514 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 12515 SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), 12516 Ld->getChain(), Op0, DAG); 12517 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); 12518 return FILDChain; 12519 } 12520 } 12521 return SDValue(); 12522} 12523 12524// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS 12525static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, 12526 X86TargetLowering::DAGCombinerInfo &DCI) { 12527 // If the LHS and RHS of the ADC node are zero, then it can't overflow and 12528 // the result is either zero or one (depending on the input carry bit). 12529 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 12530 if (X86::isZeroNode(N->getOperand(0)) && 12531 X86::isZeroNode(N->getOperand(1)) && 12532 // We don't have a good way to replace an EFLAGS use, so only do this when 12533 // dead right now. 12534 SDValue(N, 1).use_empty()) { 12535 DebugLoc DL = N->getDebugLoc(); 12536 EVT VT = N->getValueType(0); 12537 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); 12538 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, 12539 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, 12540 DAG.getConstant(X86::COND_B,MVT::i8), 12541 N->getOperand(2)), 12542 DAG.getConstant(1, VT)); 12543 return DCI.CombineTo(N, Res1, CarryOut); 12544 } 12545 12546 return SDValue(); 12547} 12548 12549// fold (add Y, (sete X, 0)) -> adc 0, Y 12550// (add Y, (setne X, 0)) -> sbb -1, Y 12551// (sub (sete X, 0), Y) -> sbb 0, Y 12552// (sub (setne X, 0), Y) -> adc -1, Y 12553static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { 12554 DebugLoc DL = N->getDebugLoc(); 12555 12556 // Look through ZExts. 12557 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); 12558 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) 12559 return SDValue(); 12560 12561 SDValue SetCC = Ext.getOperand(0); 12562 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) 12563 return SDValue(); 12564 12565 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); 12566 if (CC != X86::COND_E && CC != X86::COND_NE) 12567 return SDValue(); 12568 12569 SDValue Cmp = SetCC.getOperand(1); 12570 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || 12571 !X86::isZeroNode(Cmp.getOperand(1)) || 12572 !Cmp.getOperand(0).getValueType().isInteger()) 12573 return SDValue(); 12574 12575 SDValue CmpOp0 = Cmp.getOperand(0); 12576 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, 12577 DAG.getConstant(1, CmpOp0.getValueType())); 12578 12579 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); 12580 if (CC == X86::COND_NE) 12581 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, 12582 DL, OtherVal.getValueType(), OtherVal, 12583 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); 12584 return DAG.getNode(N->getOpcode() == ISD::SUB ? 
X86ISD::SBB : X86ISD::ADC, 12585 DL, OtherVal.getValueType(), OtherVal, 12586 DAG.getConstant(0, OtherVal.getValueType()), NewCmp); 12587} 12588 12589static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) { 12590 SDValue Op0 = N->getOperand(0); 12591 SDValue Op1 = N->getOperand(1); 12592 12593 // X86 can't encode an immediate LHS of a sub. See if we can push the 12594 // negation into a preceding instruction. 12595 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) { 12596 uint64_t Op0C = C->getSExtValue(); 12597 12598 // If the RHS of the sub is a XOR with one use and a constant, invert the 12599 // immediate. Then add one to the LHS of the sub so we can turn 12600 // X-Y -> X+~Y+1, saving one register. 12601 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && 12602 isa<ConstantSDNode>(Op1.getOperand(1))) { 12603 uint64_t XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getSExtValue(); 12604 EVT VT = Op0.getValueType(); 12605 SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT, 12606 Op1.getOperand(0), 12607 DAG.getConstant(~XorC, VT)); 12608 return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor, 12609 DAG.getConstant(Op0C+1, VT)); 12610 } 12611 } 12612 12613 return OptimizeConditionalInDecrement(N, DAG); 12614} 12615 12616SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 12617 DAGCombinerInfo &DCI) const { 12618 SelectionDAG &DAG = DCI.DAG; 12619 switch (N->getOpcode()) { 12620 default: break; 12621 case ISD::EXTRACT_VECTOR_ELT: 12622 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 12623 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 12624 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 12625 case ISD::ADD: return OptimizeConditionalInDecrement(N, DAG); 12626 case ISD::SUB: return PerformSubCombine(N, DAG); 12627 case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); 12628 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 12629 case ISD::SHL: 12630 case ISD::SRA: 12631 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 12632 case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); 12633 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 12634 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 12635 case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); 12636 case X86ISD::FXOR: 12637 case X86ISD::FOR: return PerformFORCombine(N, DAG); 12638 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 12639 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 12640 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 12641 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 12642 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); 12643 case X86ISD::SHUFPS: // Handle all target specific shuffles 12644 case X86ISD::SHUFPD: 12645 case X86ISD::PALIGN: 12646 case X86ISD::PUNPCKHBW: 12647 case X86ISD::PUNPCKHWD: 12648 case X86ISD::PUNPCKHDQ: 12649 case X86ISD::PUNPCKHQDQ: 12650 case X86ISD::UNPCKHPS: 12651 case X86ISD::UNPCKHPD: 12652 case X86ISD::VUNPCKHPSY: 12653 case X86ISD::VUNPCKHPDY: 12654 case X86ISD::PUNPCKLBW: 12655 case X86ISD::PUNPCKLWD: 12656 case X86ISD::PUNPCKLDQ: 12657 case X86ISD::PUNPCKLQDQ: 12658 case X86ISD::UNPCKLPS: 12659 case X86ISD::UNPCKLPD: 12660 case X86ISD::VUNPCKLPSY: 12661 case X86ISD::VUNPCKLPDY: 12662 case X86ISD::MOVHLPS: 12663 case X86ISD::MOVLHPS: 12664 case X86ISD::PSHUFD: 12665 case X86ISD::PSHUFHW: 12666 case X86ISD::PSHUFLW: 12667 case X86ISD::MOVSS: 12668 case X86ISD::MOVSD: 12669 case X86ISD::VPERMIL: 
12670   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
12671   }
12672
12673   return SDValue();
12674 }
12675
12676 /// isTypeDesirableForOp - Return true if the target has native support for
12677 /// the specified value type and it is 'desirable' to use the type for the
12678 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
12679 /// instruction encodings are longer and some i16 instructions are slow.
12680 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
12681   if (!isTypeLegal(VT))
12682     return false;
12683   if (VT != MVT::i16)
12684     return true;
12685
12686   switch (Opc) {
12687   default:
12688     return true;
12689   case ISD::LOAD:
12690   case ISD::SIGN_EXTEND:
12691   case ISD::ZERO_EXTEND:
12692   case ISD::ANY_EXTEND:
12693   case ISD::SHL:
12694   case ISD::SRL:
12695   case ISD::SUB:
12696   case ISD::ADD:
12697   case ISD::MUL:
12698   case ISD::AND:
12699   case ISD::OR:
12700   case ISD::XOR:
12701     return false;
12702   }
12703 }
12704
12705 /// IsDesirableToPromoteOp - This method queries the target whether it is
12706 /// beneficial for the dag combiner to promote the specified node. If true, it
12707 /// should return the desired promotion type by reference.
12708 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
12709   EVT VT = Op.getValueType();
12710   if (VT != MVT::i16)
12711     return false;
12712
12713   bool Promote = false;
12714   bool Commute = false;
12715   switch (Op.getOpcode()) {
12716   default: break;
12717   case ISD::LOAD: {
12718     LoadSDNode *LD = cast<LoadSDNode>(Op);
12719     // If the non-extending load has a single use and it's not live out, then
12720     // it might be folded.
12721     if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
12723       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
12724              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
12725         // The only case where we'd want to promote a LOAD (rather than it
12726         // being promoted as an operand) is when its only use is a live-out.
12727         if (UI->getOpcode() != ISD::CopyToReg)
12728           return false;
12729       }
12730     }
12731     Promote = true;
12732     break;
12733   }
12734   case ISD::SIGN_EXTEND:
12735   case ISD::ZERO_EXTEND:
12736   case ISD::ANY_EXTEND:
12737     Promote = true;
12738     break;
12739   case ISD::SHL:
12740   case ISD::SRL: {
12741     SDValue N0 = Op.getOperand(0);
12742     // Look out for (store (shl (load), x)).
12743     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
12744       return false;
12745     Promote = true;
12746     break;
12747   }
12748   case ISD::ADD:
12749   case ISD::MUL:
12750   case ISD::AND:
12751   case ISD::OR:
12752   case ISD::XOR:
12753     Commute = true;
12754     // fallthrough
12755   case ISD::SUB: {
12756     SDValue N0 = Op.getOperand(0);
12757     SDValue N1 = Op.getOperand(1);
12758     if (!Commute && MayFoldLoad(N1))
12759       return false;
12760     // Avoid disabling potential load folding opportunities.
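    // For instance (a sketch): promoting an "addw (mem), %r" style candidate
    // to i32 would block folding the load into the arithmetic op, so we keep
    // i16 whenever a foldable load is paired with a non-constant operand or
    // the result may itself fold into a store.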
12761 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 12762 return false; 12763 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 12764 return false; 12765 Promote = true; 12766 } 12767 } 12768 12769 PVT = MVT::i32; 12770 return Promote; 12771} 12772 12773//===----------------------------------------------------------------------===// 12774// X86 Inline Assembly Support 12775//===----------------------------------------------------------------------===// 12776 12777bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 12778 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 12779 12780 std::string AsmStr = IA->getAsmString(); 12781 12782 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 12783 SmallVector<StringRef, 4> AsmPieces; 12784 SplitString(AsmStr, AsmPieces, ";\n"); 12785 12786 switch (AsmPieces.size()) { 12787 default: return false; 12788 case 1: 12789 AsmStr = AsmPieces[0]; 12790 AsmPieces.clear(); 12791 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 12792 12793 // FIXME: this should verify that we are targeting a 486 or better. If not, 12794 // we will turn this bswap into something that will be lowered to logical ops 12795 // instead of emitting the bswap asm. For now, we don't support 486 or lower 12796 // so don't worry about this. 12797 // bswap $0 12798 if (AsmPieces.size() == 2 && 12799 (AsmPieces[0] == "bswap" || 12800 AsmPieces[0] == "bswapq" || 12801 AsmPieces[0] == "bswapl") && 12802 (AsmPieces[1] == "$0" || 12803 AsmPieces[1] == "${0:q}")) { 12804 // No need to check constraints, nothing other than the equivalent of 12805 // "=r,0" would be valid here. 12806 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 12807 if (!Ty || Ty->getBitWidth() % 16 != 0) 12808 return false; 12809 return IntrinsicLowering::LowerToByteSwap(CI); 12810 } 12811 // rorw $$8, ${0:w} --> llvm.bswap.i16 12812 if (CI->getType()->isIntegerTy(16) && 12813 AsmPieces.size() == 3 && 12814 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 12815 AsmPieces[1] == "$$8," && 12816 AsmPieces[2] == "${0:w}" && 12817 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 12818 AsmPieces.clear(); 12819 const std::string &ConstraintsStr = IA->getConstraintString(); 12820 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 12821 std::sort(AsmPieces.begin(), AsmPieces.end()); 12822 if (AsmPieces.size() == 4 && 12823 AsmPieces[0] == "~{cc}" && 12824 AsmPieces[1] == "~{dirflag}" && 12825 AsmPieces[2] == "~{flags}" && 12826 AsmPieces[3] == "~{fpsr}") { 12827 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 12828 if (!Ty || Ty->getBitWidth() % 16 != 0) 12829 return false; 12830 return IntrinsicLowering::LowerToByteSwap(CI); 12831 } 12832 } 12833 break; 12834 case 3: 12835 if (CI->getType()->isIntegerTy(32) && 12836 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 12837 SmallVector<StringRef, 4> Words; 12838 SplitString(AsmPieces[0], Words, " \t,"); 12839 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 12840 Words[2] == "${0:w}") { 12841 Words.clear(); 12842 SplitString(AsmPieces[1], Words, " \t,"); 12843 if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" && 12844 Words[2] == "$0") { 12845 Words.clear(); 12846 SplitString(AsmPieces[2], Words, " \t,"); 12847 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 12848 Words[2] == "${0:w}") { 12849 AsmPieces.clear(); 12850 const std::string &ConstraintsStr = 
IA->getConstraintString(); 12851 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 12852 std::sort(AsmPieces.begin(), AsmPieces.end()); 12853 if (AsmPieces.size() == 4 && 12854 AsmPieces[0] == "~{cc}" && 12855 AsmPieces[1] == "~{dirflag}" && 12856 AsmPieces[2] == "~{flags}" && 12857 AsmPieces[3] == "~{fpsr}") { 12858 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 12859 if (!Ty || Ty->getBitWidth() % 16 != 0) 12860 return false; 12861 return IntrinsicLowering::LowerToByteSwap(CI); 12862 } 12863 } 12864 } 12865 } 12866 } 12867 12868 if (CI->getType()->isIntegerTy(64)) { 12869 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); 12870 if (Constraints.size() >= 2 && 12871 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 12872 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 12873 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 12874 SmallVector<StringRef, 4> Words; 12875 SplitString(AsmPieces[0], Words, " \t"); 12876 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 12877 Words.clear(); 12878 SplitString(AsmPieces[1], Words, " \t"); 12879 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 12880 Words.clear(); 12881 SplitString(AsmPieces[2], Words, " \t,"); 12882 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 12883 Words[2] == "%edx") { 12884 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 12885 if (!Ty || Ty->getBitWidth() % 16 != 0) 12886 return false; 12887 return IntrinsicLowering::LowerToByteSwap(CI); 12888 } 12889 } 12890 } 12891 } 12892 } 12893 break; 12894 } 12895 return false; 12896} 12897 12898 12899 12900/// getConstraintType - Given a constraint letter, return the type of 12901/// constraint it is for this target. 12902X86TargetLowering::ConstraintType 12903X86TargetLowering::getConstraintType(const std::string &Constraint) const { 12904 if (Constraint.size() == 1) { 12905 switch (Constraint[0]) { 12906 case 'R': 12907 case 'q': 12908 case 'Q': 12909 case 'f': 12910 case 't': 12911 case 'u': 12912 case 'y': 12913 case 'x': 12914 case 'Y': 12915 case 'l': 12916 return C_RegisterClass; 12917 case 'a': 12918 case 'b': 12919 case 'c': 12920 case 'd': 12921 case 'S': 12922 case 'D': 12923 case 'A': 12924 return C_Register; 12925 case 'I': 12926 case 'J': 12927 case 'K': 12928 case 'L': 12929 case 'M': 12930 case 'N': 12931 case 'G': 12932 case 'C': 12933 case 'e': 12934 case 'Z': 12935 return C_Other; 12936 default: 12937 break; 12938 } 12939 } 12940 return TargetLowering::getConstraintType(Constraint); 12941} 12942 12943/// Examine constraint type and operand type and determine a weight value. 12944/// This object must already have been set up with the operand type 12945/// and the current alternative constraint selected. 12946TargetLowering::ConstraintWeight 12947 X86TargetLowering::getSingleConstraintMatchWeight( 12948 AsmOperandInfo &info, const char *constraint) const { 12949 ConstraintWeight weight = CW_Invalid; 12950 Value *CallOperandVal = info.CallOperandVal; 12951 // If we don't have a value, we can't do a match, 12952 // but allow it at the lowest weight. 12953 if (CallOperandVal == NULL) 12954 return CW_Default; 12955 Type *type = CallOperandVal->getType(); 12956 // Look at the constraint type. 
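  // For example, for constraint 'I' a ConstantInt in [0,31] scores
  // CW_Constant (the best weight, since an immediate consumes no register),
  // while an out-of-range constant leaves the initial CW_Invalid in place and
  // this alternative loses to any other that matches.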
12957   switch (*constraint) {
12958   default:
12959     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
12960     break;
12961   case 'R':
12962   case 'q':
12963   case 'Q':
12964   case 'a':
12965   case 'b':
12966   case 'c':
12967   case 'd':
12968   case 'S':
12969   case 'D':
12970   case 'A':
12971     if (type->isIntegerTy())
12972       weight = CW_SpecificReg;
12973     break;
12974   case 'f':
12975   case 't':
12976   case 'u':
12977     if (type->isFloatingPointTy())
12978       weight = CW_SpecificReg;
12979     break;
12980   case 'y':
12981     if (type->isX86_MMXTy() && Subtarget->hasMMX())
12982       weight = CW_SpecificReg;
12983     break;
12984   case 'x':
12985   case 'Y':
12986     if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM())
12987       weight = CW_Register;
12988     break;
12989   case 'I':
12990     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
12991       if (C->getZExtValue() <= 31)
12992         weight = CW_Constant;
12993     }
12994     break;
12995   case 'J':
12996     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
12997       if (C->getZExtValue() <= 63)
12998         weight = CW_Constant;
12999     }
13000     break;
13001   case 'K':
13002     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13003       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
13004         weight = CW_Constant;
13005     }
13006     break;
13007   case 'L':
13008     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13009       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
13010         weight = CW_Constant;
13011     }
13012     break;
13013   case 'M':
13014     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13015       if (C->getZExtValue() <= 3)
13016         weight = CW_Constant;
13017     }
13018     break;
13019   case 'N':
13020     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13021       if (C->getZExtValue() <= 0xff)
13022         weight = CW_Constant;
13023     }
13024     break;
13025   case 'G':
13026   case 'C':
13027     if (isa<ConstantFP>(CallOperandVal))
13028       weight = CW_Constant;
13029     break;
13030   case 'e':
13031     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13032       if ((C->getSExtValue() >= -0x80000000LL) &&
13033           (C->getSExtValue() <= 0x7fffffffLL))
13034         weight = CW_Constant;
13035     }
13036     break;
13037   case 'Z':
13038     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13039       if (C->getZExtValue() <= 0xffffffff)
13040         weight = CW_Constant;
13041     }
13042     break;
13043   }
13044   return weight;
13045 }
13046
13047 /// LowerXConstraint - try to replace an X constraint, which matches anything,
13048 /// with another that has more specific requirements based on the type of the
13049 /// corresponding operand.
13050 const char *X86TargetLowering::
13051 LowerXConstraint(EVT ConstraintVT) const {
13052   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
13053   // 'f' like normal targets.
13054   if (ConstraintVT.isFloatingPoint()) {
13055     if (Subtarget->hasXMMInt())
13056       return "Y";
13057     if (Subtarget->hasXMM())
13058       return "x";
13059   }
13060
13061   return TargetLowering::LowerXConstraint(ConstraintVT);
13062 }
13063
13064 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
13065 /// vector.  If it is invalid, don't add anything to Ops.
13066 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
13067                                                      std::string &Constraint,
13068                                                      std::vector<SDValue>&Ops,
13069                                                      SelectionDAG &DAG) const {
13070   SDValue Result(0, 0);
13071
13072   // Only support length 1 constraints for now.
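  // e.g. (a sketch) for an 'N' operand, a constant 0x80 passes the <= 255
  // check below and is added to Ops as a target constant, while a value of
  // 300 fails the range check and we return without adding anything; the
  // caller then reports the constraint as unsatisfiable.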
13073 if (Constraint.length() > 1) return; 13074 13075 char ConstraintLetter = Constraint[0]; 13076 switch (ConstraintLetter) { 13077 default: break; 13078 case 'I': 13079 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13080 if (C->getZExtValue() <= 31) { 13081 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13082 break; 13083 } 13084 } 13085 return; 13086 case 'J': 13087 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13088 if (C->getZExtValue() <= 63) { 13089 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13090 break; 13091 } 13092 } 13093 return; 13094 case 'K': 13095 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13096 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 13097 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13098 break; 13099 } 13100 } 13101 return; 13102 case 'N': 13103 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13104 if (C->getZExtValue() <= 255) { 13105 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13106 break; 13107 } 13108 } 13109 return; 13110 case 'e': { 13111 // 32-bit signed value 13112 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13113 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 13114 C->getSExtValue())) { 13115 // Widen to 64 bits here to get it sign extended. 13116 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 13117 break; 13118 } 13119 // FIXME gcc accepts some relocatable values here too, but only in certain 13120 // memory models; it's complicated. 13121 } 13122 return; 13123 } 13124 case 'Z': { 13125 // 32-bit unsigned value 13126 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13127 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 13128 C->getZExtValue())) { 13129 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13130 break; 13131 } 13132 } 13133 // FIXME gcc accepts some relocatable values here too, but only in certain 13134 // memory models; it's complicated. 13135 return; 13136 } 13137 case 'i': { 13138 // Literal immediates are always ok. 13139 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 13140 // Widen to 64 bits here to get it sign extended. 13141 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 13142 break; 13143 } 13144 13145 // In any sort of PIC mode addresses need to be computed at runtime by 13146 // adding in a register or some sort of table lookup. These can't 13147 // be used as immediates. 13148 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 13149 return; 13150 13151 // If we are in non-pic codegen mode, we allow the address of a global (with 13152 // an optional displacement) to be used with 'i'. 13153 GlobalAddressSDNode *GA = 0; 13154 int64_t Offset = 0; 13155 13156 // Match either (GA), (GA+C), (GA+C1+C2), etc. 13157 while (1) { 13158 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 13159 Offset += GA->getOffset(); 13160 break; 13161 } else if (Op.getOpcode() == ISD::ADD) { 13162 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 13163 Offset += C->getZExtValue(); 13164 Op = Op.getOperand(0); 13165 continue; 13166 } 13167 } else if (Op.getOpcode() == ISD::SUB) { 13168 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 13169 Offset += -C->getZExtValue(); 13170 Op = Op.getOperand(0); 13171 continue; 13172 } 13173 } 13174 13175 // Otherwise, this isn't something we can handle, reject it. 
13176 return; 13177 } 13178 13179 const GlobalValue *GV = GA->getGlobal(); 13180 // If we require an extra load to get this address, as in PIC mode, we 13181 // can't accept it. 13182 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 13183 getTargetMachine()))) 13184 return; 13185 13186 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 13187 GA->getValueType(0), Offset); 13188 break; 13189 } 13190 } 13191 13192 if (Result.getNode()) { 13193 Ops.push_back(Result); 13194 return; 13195 } 13196 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 13197} 13198 13199std::pair<unsigned, const TargetRegisterClass*> 13200X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 13201 EVT VT) const { 13202 // First, see if this is a constraint that directly corresponds to an LLVM 13203 // register class. 13204 if (Constraint.size() == 1) { 13205 // GCC Constraint Letters 13206 switch (Constraint[0]) { 13207 default: break; 13208 // TODO: Slight differences here in allocation order and leaving 13209 // RIP in the class. Do they matter any more here than they do 13210 // in the normal allocation? 13211 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 13212 if (Subtarget->is64Bit()) { 13213 if (VT == MVT::i32 || VT == MVT::f32) 13214 return std::make_pair(0U, X86::GR32RegisterClass); 13215 else if (VT == MVT::i16) 13216 return std::make_pair(0U, X86::GR16RegisterClass); 13217 else if (VT == MVT::i8 || VT == MVT::i1) 13218 return std::make_pair(0U, X86::GR8RegisterClass); 13219 else if (VT == MVT::i64 || VT == MVT::f64) 13220 return std::make_pair(0U, X86::GR64RegisterClass); 13221 break; 13222 } 13223 // 32-bit fallthrough 13224 case 'Q': // Q_REGS 13225 if (VT == MVT::i32 || VT == MVT::f32) 13226 return std::make_pair(0U, X86::GR32_ABCDRegisterClass); 13227 else if (VT == MVT::i16) 13228 return std::make_pair(0U, X86::GR16_ABCDRegisterClass); 13229 else if (VT == MVT::i8 || VT == MVT::i1) 13230 return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass); 13231 else if (VT == MVT::i64) 13232 return std::make_pair(0U, X86::GR64_ABCDRegisterClass); 13233 break; 13234 case 'r': // GENERAL_REGS 13235 case 'l': // INDEX_REGS 13236 if (VT == MVT::i8 || VT == MVT::i1) 13237 return std::make_pair(0U, X86::GR8RegisterClass); 13238 if (VT == MVT::i16) 13239 return std::make_pair(0U, X86::GR16RegisterClass); 13240 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) 13241 return std::make_pair(0U, X86::GR32RegisterClass); 13242 return std::make_pair(0U, X86::GR64RegisterClass); 13243 case 'R': // LEGACY_REGS 13244 if (VT == MVT::i8 || VT == MVT::i1) 13245 return std::make_pair(0U, X86::GR8_NOREXRegisterClass); 13246 if (VT == MVT::i16) 13247 return std::make_pair(0U, X86::GR16_NOREXRegisterClass); 13248 if (VT == MVT::i32 || !Subtarget->is64Bit()) 13249 return std::make_pair(0U, X86::GR32_NOREXRegisterClass); 13250 return std::make_pair(0U, X86::GR64_NOREXRegisterClass); 13251 case 'f': // FP Stack registers. 13252 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 13253 // value to the correct fpstack register class. 13254 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 13255 return std::make_pair(0U, X86::RFP32RegisterClass); 13256 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 13257 return std::make_pair(0U, X86::RFP64RegisterClass); 13258 return std::make_pair(0U, X86::RFP80RegisterClass); 13259 case 'y': // MMX_REGS if MMX allowed. 
13260     if (!Subtarget->hasMMX()) break;
13261     return std::make_pair(0U, X86::VR64RegisterClass);
13262   case 'Y':   // SSE_REGS if SSE2 allowed
13263     if (!Subtarget->hasXMMInt()) break;
13264     // FALL THROUGH.
13265   case 'x':   // SSE_REGS if SSE1 allowed
13266     if (!Subtarget->hasXMM()) break;
13267
13268     switch (VT.getSimpleVT().SimpleTy) {
13269     default: break;
13270     // Scalar SSE types.
13271     case MVT::f32:
13272     case MVT::i32:
13273       return std::make_pair(0U, X86::FR32RegisterClass);
13274     case MVT::f64:
13275     case MVT::i64:
13276       return std::make_pair(0U, X86::FR64RegisterClass);
13277     // Vector types.
13278     case MVT::v16i8:
13279     case MVT::v8i16:
13280     case MVT::v4i32:
13281     case MVT::v2i64:
13282     case MVT::v4f32:
13283     case MVT::v2f64:
13284       return std::make_pair(0U, X86::VR128RegisterClass);
13285     }
13286     break;
13287   }
13288 }
13289
13290   // Use the default implementation in TargetLowering to convert the register
13291   // constraint into a member of a register class.
13292   std::pair<unsigned, const TargetRegisterClass*> Res;
13293   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
13294
13295   // Not found as a standard register?
13296   if (Res.second == 0) {
13297     // Map st(0) .. st(7) to ST0 .. ST7.
13298     if (Constraint.size() == 7 && Constraint[0] == '{' &&
13299         tolower(Constraint[1]) == 's' &&
13300         tolower(Constraint[2]) == 't' &&
13301         Constraint[3] == '(' &&
13302         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
13303         Constraint[5] == ')' &&
13304         Constraint[6] == '}') {
13305
13306       Res.first = X86::ST0+Constraint[4]-'0';
13307       Res.second = X86::RFP80RegisterClass;
13308       return Res;
13309     }
13310
13311     // GCC allows "st(0)" to be called just plain "st".
13312     if (StringRef("{st}").equals_lower(Constraint)) {
13313       Res.first = X86::ST0;
13314       Res.second = X86::RFP80RegisterClass;
13315       return Res;
13316     }
13317
13318     // flags -> EFLAGS
13319     if (StringRef("{flags}").equals_lower(Constraint)) {
13320       Res.first = X86::EFLAGS;
13321       Res.second = X86::CCRRegisterClass;
13322       return Res;
13323     }
13324
13325     // 'A' means EAX + EDX.
13326     if (Constraint == "A") {
13327       Res.first = X86::EAX;
13328       Res.second = X86::GR32_ADRegisterClass;
13329       return Res;
13330     }
13331     return Res;
13332   }
13333
13334   // Otherwise, check to see if this is a register class of the wrong value
13335   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
13336   // turn into {ax},{dx}.
13337   if (Res.second->hasType(VT))
13338     return Res;   // Correct type already, nothing to do.
13339
13340   // All of the single-register GCC register classes map their values onto
13341   // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
13342   // really want an 8-bit or 32-bit register, map to the appropriate register
13343   // class and return the appropriate register.
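  // e.g. the constraint "{ax}" paired with an i32 operand initially resolves
  // to (X86::AX, GR16); the remapping below upgrades it to (X86::EAX, GR32)
  // so the asm operand gets the full 32-bit register rather than a subreg.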
13344 if (Res.second == X86::GR16RegisterClass) { 13345 if (VT == MVT::i8) { 13346 unsigned DestReg = 0; 13347 switch (Res.first) { 13348 default: break; 13349 case X86::AX: DestReg = X86::AL; break; 13350 case X86::DX: DestReg = X86::DL; break; 13351 case X86::CX: DestReg = X86::CL; break; 13352 case X86::BX: DestReg = X86::BL; break; 13353 } 13354 if (DestReg) { 13355 Res.first = DestReg; 13356 Res.second = X86::GR8RegisterClass; 13357 } 13358 } else if (VT == MVT::i32) { 13359 unsigned DestReg = 0; 13360 switch (Res.first) { 13361 default: break; 13362 case X86::AX: DestReg = X86::EAX; break; 13363 case X86::DX: DestReg = X86::EDX; break; 13364 case X86::CX: DestReg = X86::ECX; break; 13365 case X86::BX: DestReg = X86::EBX; break; 13366 case X86::SI: DestReg = X86::ESI; break; 13367 case X86::DI: DestReg = X86::EDI; break; 13368 case X86::BP: DestReg = X86::EBP; break; 13369 case X86::SP: DestReg = X86::ESP; break; 13370 } 13371 if (DestReg) { 13372 Res.first = DestReg; 13373 Res.second = X86::GR32RegisterClass; 13374 } 13375 } else if (VT == MVT::i64) { 13376 unsigned DestReg = 0; 13377 switch (Res.first) { 13378 default: break; 13379 case X86::AX: DestReg = X86::RAX; break; 13380 case X86::DX: DestReg = X86::RDX; break; 13381 case X86::CX: DestReg = X86::RCX; break; 13382 case X86::BX: DestReg = X86::RBX; break; 13383 case X86::SI: DestReg = X86::RSI; break; 13384 case X86::DI: DestReg = X86::RDI; break; 13385 case X86::BP: DestReg = X86::RBP; break; 13386 case X86::SP: DestReg = X86::RSP; break; 13387 } 13388 if (DestReg) { 13389 Res.first = DestReg; 13390 Res.second = X86::GR64RegisterClass; 13391 } 13392 } 13393 } else if (Res.second == X86::FR32RegisterClass || 13394 Res.second == X86::FR64RegisterClass || 13395 Res.second == X86::VR128RegisterClass) { 13396 // Handle references to XMM physical registers that got mapped into the 13397 // wrong class. This can happen with constraints like {xmm0} where the 13398 // target independent register mapper will just pick the first match it can 13399 // find, ignoring the required type. 13400 if (VT == MVT::f32) 13401 Res.second = X86::FR32RegisterClass; 13402 else if (VT == MVT::f64) 13403 Res.second = X86::FR64RegisterClass; 13404 else if (X86::VR128RegisterClass->hasType(VT)) 13405 Res.second = X86::VR128RegisterClass; 13406 } 13407 13408 return Res; 13409} 13410