X86ISelLowering.cpp revision 2753ae314f656eab6d42c918469ce4ebf422cee5
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "Utils/X86ShuffleDecode.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static SDValue Insert128BitVector(SDValue Result,
                                  SDValue Vec,
                                  SDValue Idx,
                                  SelectionDAG &DAG,
                                  DebugLoc dl);

static SDValue Extract128BitVector(SDValue Vec,
                                   SDValue Idx,
                                   SelectionDAG &DAG,
                                   DebugLoc dl);

/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
/// sets things up to match to an AVX VEXTRACTF128 instruction or a
/// simple subregister reference. Idx is an index in the 128 bits we
/// want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec,
                                   SDValue Idx,
                                   SelectionDAG &DAG,
                                   DebugLoc dl) {
  EVT VT = Vec.getValueType();
  assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
  EVT ElVT = VT.getVectorElementType();
  int Factor = VT.getSizeInBits()/128;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract from UNDEF is UNDEF.
  if (Vec.getOpcode() == ISD::UNDEF)
    return DAG.getNode(ISD::UNDEF, dl, ResultVT);

  if (isa<ConstantSDNode>(Idx)) {
    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

    // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
    // we can match to VEXTRACTF128.
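    //
    // For illustration (values assumed, not from any particular caller):
    // with a v8i32 source vector and a constant Idx of 5, the math below
    // gives
    //   ElemsPerChunk    = 128 / 32             = 4
    //   NormalizedIdxVal = ((5 * 32) / 128) * 4 = 4
    // i.e. element 5 lives in the upper 128-bit half, so the extracted
    // subvector starts at element 4 of the input.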
    unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();

    // This is the index of the first element of the 128-bit chunk
    // we want.
    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
                                 * ElemsPerChunk);

    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
    SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                                 VecIdx);

    return Result;
  }

  return SDValue();
}

/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
/// sets things up to match to an AVX VINSERTF128 instruction or a
/// simple superregister reference.  Idx is an index in the 128 bits
/// we want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result,
                                  SDValue Vec,
                                  SDValue Idx,
                                  SelectionDAG &DAG,
                                  DebugLoc dl) {
  if (isa<ConstantSDNode>(Idx)) {
    EVT VT = Vec.getValueType();
    assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");

    EVT ElVT = VT.getVectorElementType();
    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
    EVT ResultVT = Result.getValueType();

    // Insert the relevant 128 bits.
    unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();

    // This is the index of the first element of the 128-bit chunk
    // we want.
    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
                                 * ElemsPerChunk);

    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
    Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                         VecIdx);
    return Result;
  }

  return SDValue();
}

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
  bool is64Bit = Subtarget->is64Bit();

  if (Subtarget->isTargetEnvMacho()) {
    if (is64Bit)
      return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  }

  if (Subtarget->isTargetELF())
    return new TargetLoweringObjectFileELF();
  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    return new TargetLoweringObjectFileCOFF();
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasXMMInt() || Subtarget->hasAVX();
  X86ScalarSSEf32 = Subtarget->hasXMM() || Subtarget->hasAVX();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.
  static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird: it always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler; for
  // 32-bit code, use register-pressure-specific scheduling.
  if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    // Setup Windows compiler runtime calls.
194 setLibcallName(RTLIB::SDIV_I64, "_alldiv"); 195 setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); 196 setLibcallName(RTLIB::SREM_I64, "_allrem"); 197 setLibcallName(RTLIB::UREM_I64, "_aullrem"); 198 setLibcallName(RTLIB::MUL_I64, "_allmul"); 199 setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2"); 200 setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2"); 201 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); 202 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); 203 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); 204 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); 205 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); 206 setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C); 207 setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C); 208 } 209 210 if (Subtarget->isTargetDarwin()) { 211 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. 212 setUseUnderscoreSetJmp(false); 213 setUseUnderscoreLongJmp(false); 214 } else if (Subtarget->isTargetMingw()) { 215 // MS runtime is weird: it exports _setjmp, but longjmp! 216 setUseUnderscoreSetJmp(true); 217 setUseUnderscoreLongJmp(false); 218 } else { 219 setUseUnderscoreSetJmp(true); 220 setUseUnderscoreLongJmp(true); 221 } 222 223 // Set up the register classes. 224 addRegisterClass(MVT::i8, X86::GR8RegisterClass); 225 addRegisterClass(MVT::i16, X86::GR16RegisterClass); 226 addRegisterClass(MVT::i32, X86::GR32RegisterClass); 227 if (Subtarget->is64Bit()) 228 addRegisterClass(MVT::i64, X86::GR64RegisterClass); 229 230 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 231 232 // We don't accept any truncstore of integer registers. 233 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 234 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 235 setTruncStoreAction(MVT::i64, MVT::i8 , Expand); 236 setTruncStoreAction(MVT::i32, MVT::i16, Expand); 237 setTruncStoreAction(MVT::i32, MVT::i8 , Expand); 238 setTruncStoreAction(MVT::i16, MVT::i8, Expand); 239 240 // SETOEQ and SETUNE require checking two conditions. 241 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); 242 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); 243 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); 244 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); 245 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); 246 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); 247 248 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this 249 // operation. 250 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); 251 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); 252 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); 253 254 if (Subtarget->is64Bit()) { 255 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); 256 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); 257 } else if (!UseSoftFloat) { 258 // We have an algorithm for SSE2->double, and we turn this into a 259 // 64-bit FILD followed by conditional FADD for other targets. 260 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 261 // We have an algorithm for SSE2, and we turn this into a 64-bit 262 // FILD for other targets. 263 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); 264 } 265 266 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have 267 // this operation. 
  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
  }

  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    // f32 and f64 cases are Legal, f80 case is not
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86.
Using the two-result multiply instructions 346 // when both high and low results are needed must be arranged by dagcombine. 347 for (unsigned i = 0, e = 4; i != e; ++i) { 348 MVT VT = IntVTs[i]; 349 setOperationAction(ISD::MULHS, VT, Expand); 350 setOperationAction(ISD::MULHU, VT, Expand); 351 setOperationAction(ISD::SDIV, VT, Expand); 352 setOperationAction(ISD::UDIV, VT, Expand); 353 setOperationAction(ISD::SREM, VT, Expand); 354 setOperationAction(ISD::UREM, VT, Expand); 355 356 // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. 357 setOperationAction(ISD::ADDC, VT, Custom); 358 setOperationAction(ISD::ADDE, VT, Custom); 359 setOperationAction(ISD::SUBC, VT, Custom); 360 setOperationAction(ISD::SUBE, VT, Custom); 361 } 362 363 setOperationAction(ISD::BR_JT , MVT::Other, Expand); 364 setOperationAction(ISD::BRCOND , MVT::Other, Custom); 365 setOperationAction(ISD::BR_CC , MVT::Other, Expand); 366 setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); 367 if (Subtarget->is64Bit()) 368 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 369 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); 370 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 371 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); 372 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); 373 setOperationAction(ISD::FREM , MVT::f32 , Expand); 374 setOperationAction(ISD::FREM , MVT::f64 , Expand); 375 setOperationAction(ISD::FREM , MVT::f80 , Expand); 376 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); 377 378 setOperationAction(ISD::CTTZ , MVT::i8 , Custom); 379 setOperationAction(ISD::CTLZ , MVT::i8 , Custom); 380 setOperationAction(ISD::CTTZ , MVT::i16 , Custom); 381 setOperationAction(ISD::CTLZ , MVT::i16 , Custom); 382 setOperationAction(ISD::CTTZ , MVT::i32 , Custom); 383 setOperationAction(ISD::CTLZ , MVT::i32 , Custom); 384 if (Subtarget->is64Bit()) { 385 setOperationAction(ISD::CTTZ , MVT::i64 , Custom); 386 setOperationAction(ISD::CTLZ , MVT::i64 , Custom); 387 } 388 389 if (Subtarget->hasPOPCNT()) { 390 setOperationAction(ISD::CTPOP , MVT::i8 , Promote); 391 } else { 392 setOperationAction(ISD::CTPOP , MVT::i8 , Expand); 393 setOperationAction(ISD::CTPOP , MVT::i16 , Expand); 394 setOperationAction(ISD::CTPOP , MVT::i32 , Expand); 395 if (Subtarget->is64Bit()) 396 setOperationAction(ISD::CTPOP , MVT::i64 , Expand); 397 } 398 399 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); 400 setOperationAction(ISD::BSWAP , MVT::i16 , Expand); 401 402 // These should be promoted to a larger select which is supported. 403 setOperationAction(ISD::SELECT , MVT::i1 , Promote); 404 // X86 wants to expand cmov itself. 
  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
  }
  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
  }

  if (Subtarget->hasXMM())
    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);

  setOperationAction(ISD::MEMBARRIER      , MVT::Other, Custom);
  setOperationAction(ISD::ATOMIC_FENCE    , MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
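  //
  // For example (an illustrative IR sequence, not taken from a test case):
  //
  //   fence seq_cst
  //   %old = atomicrmw add i32* %p, i32 1 seq_cst
  //   fence seq_cst
  //
  // can be emitted as a single locked add/xadd on x86; the surrounding
  // fences add nothing beyond what the locked instruction already implies.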
460 setShouldFoldAtomicFences(true); 461 462 // Expand certain atomics 463 for (unsigned i = 0, e = 4; i != e; ++i) { 464 MVT VT = IntVTs[i]; 465 setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); 466 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); 467 setOperationAction(ISD::ATOMIC_STORE, VT, Custom); 468 } 469 470 if (!Subtarget->is64Bit()) { 471 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); 472 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); 473 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 474 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 475 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); 476 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); 477 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); 478 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); 479 } 480 481 if (Subtarget->hasCmpxchg16b()) { 482 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); 483 } 484 485 // FIXME - use subtarget debug flags 486 if (!Subtarget->isTargetDarwin() && 487 !Subtarget->isTargetELF() && 488 !Subtarget->isTargetCygMing()) { 489 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); 490 } 491 492 setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); 493 setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); 494 setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); 495 setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); 496 if (Subtarget->is64Bit()) { 497 setExceptionPointerRegister(X86::RAX); 498 setExceptionSelectorRegister(X86::RDX); 499 } else { 500 setExceptionPointerRegister(X86::EAX); 501 setExceptionSelectorRegister(X86::EDX); 502 } 503 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); 504 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); 505 506 setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom); 507 508 setOperationAction(ISD::TRAP, MVT::Other, Legal); 509 510 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 511 setOperationAction(ISD::VASTART , MVT::Other, Custom); 512 setOperationAction(ISD::VAEND , MVT::Other, Expand); 513 if (Subtarget->is64Bit()) { 514 setOperationAction(ISD::VAARG , MVT::Other, Custom); 515 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 516 } else { 517 setOperationAction(ISD::VAARG , MVT::Other, Expand); 518 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 519 } 520 521 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 522 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 523 setOperationAction(ISD::DYNAMIC_STACKALLOC, 524 (Subtarget->is64Bit() ? MVT::i64 : MVT::i32), 525 (Subtarget->isTargetCOFF() 526 && !Subtarget->isTargetEnvMacho() 527 ? Custom : Expand)); 528 529 if (!UseSoftFloat && X86ScalarSSEf64) { 530 // f32 and f64 use SSE. 531 // Set up the FP register classes. 532 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 533 addRegisterClass(MVT::f64, X86::FR64RegisterClass); 534 535 // Use ANDPD to simulate FABS. 536 setOperationAction(ISD::FABS , MVT::f64, Custom); 537 setOperationAction(ISD::FABS , MVT::f32, Custom); 538 539 // Use XORP to simulate FNEG. 540 setOperationAction(ISD::FNEG , MVT::f64, Custom); 541 setOperationAction(ISD::FNEG , MVT::f32, Custom); 542 543 // Use ANDPD and ORPD to simulate FCOPYSIGN. 544 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 545 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 546 547 // Lower this to FGETSIGNx86 plus an AND. 
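  // Roughly (an illustrative sketch, not the exact code emitted), FGETSIGN
  // of an f64 held in an XMM register becomes:
  //   movmskpd %xmm0, %eax   ; sign bits of the lanes into the low bits of eax
  //   andl     $1, %eax      ; keep only the sign of the low element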
548 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); 549 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); 550 551 // We don't support sin/cos/fmod 552 setOperationAction(ISD::FSIN , MVT::f64, Expand); 553 setOperationAction(ISD::FCOS , MVT::f64, Expand); 554 setOperationAction(ISD::FSIN , MVT::f32, Expand); 555 setOperationAction(ISD::FCOS , MVT::f32, Expand); 556 557 // Expand FP immediates into loads from the stack, except for the special 558 // cases we handle. 559 addLegalFPImmediate(APFloat(+0.0)); // xorpd 560 addLegalFPImmediate(APFloat(+0.0f)); // xorps 561 } else if (!UseSoftFloat && X86ScalarSSEf32) { 562 // Use SSE for f32, x87 for f64. 563 // Set up the FP register classes. 564 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 565 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 566 567 // Use ANDPS to simulate FABS. 568 setOperationAction(ISD::FABS , MVT::f32, Custom); 569 570 // Use XORP to simulate FNEG. 571 setOperationAction(ISD::FNEG , MVT::f32, Custom); 572 573 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 574 575 // Use ANDPS and ORPS to simulate FCOPYSIGN. 576 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 577 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 578 579 // We don't support sin/cos/fmod 580 setOperationAction(ISD::FSIN , MVT::f32, Expand); 581 setOperationAction(ISD::FCOS , MVT::f32, Expand); 582 583 // Special cases we handle for FP constants. 584 addLegalFPImmediate(APFloat(+0.0f)); // xorps 585 addLegalFPImmediate(APFloat(+0.0)); // FLD0 586 addLegalFPImmediate(APFloat(+1.0)); // FLD1 587 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 588 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 589 590 if (!UnsafeFPMath) { 591 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 592 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 593 } 594 } else if (!UseSoftFloat) { 595 // f32 and f64 in x87. 596 // Set up the FP register classes. 597 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 598 addRegisterClass(MVT::f32, X86::RFP32RegisterClass); 599 600 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 601 setOperationAction(ISD::UNDEF, MVT::f32, Expand); 602 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 603 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 604 605 if (!UnsafeFPMath) { 606 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 607 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 608 } 609 addLegalFPImmediate(APFloat(+0.0)); // FLD0 610 addLegalFPImmediate(APFloat(+1.0)); // FLD1 611 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 612 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 613 addLegalFPImmediate(APFloat(+0.0f)); // FLD0 614 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 615 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS 616 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS 617 } 618 619 // We don't support FMA. 620 setOperationAction(ISD::FMA, MVT::f64, Expand); 621 setOperationAction(ISD::FMA, MVT::f32, Expand); 622 623 // Long double always uses X87. 
624 if (!UseSoftFloat) { 625 addRegisterClass(MVT::f80, X86::RFP80RegisterClass); 626 setOperationAction(ISD::UNDEF, MVT::f80, Expand); 627 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); 628 { 629 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); 630 addLegalFPImmediate(TmpFlt); // FLD0 631 TmpFlt.changeSign(); 632 addLegalFPImmediate(TmpFlt); // FLD0/FCHS 633 634 bool ignored; 635 APFloat TmpFlt2(+1.0); 636 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 637 &ignored); 638 addLegalFPImmediate(TmpFlt2); // FLD1 639 TmpFlt2.changeSign(); 640 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS 641 } 642 643 if (!UnsafeFPMath) { 644 setOperationAction(ISD::FSIN , MVT::f80 , Expand); 645 setOperationAction(ISD::FCOS , MVT::f80 , Expand); 646 } 647 648 setOperationAction(ISD::FMA, MVT::f80, Expand); 649 } 650 651 // Always use a library call for pow. 652 setOperationAction(ISD::FPOW , MVT::f32 , Expand); 653 setOperationAction(ISD::FPOW , MVT::f64 , Expand); 654 setOperationAction(ISD::FPOW , MVT::f80 , Expand); 655 656 setOperationAction(ISD::FLOG, MVT::f80, Expand); 657 setOperationAction(ISD::FLOG2, MVT::f80, Expand); 658 setOperationAction(ISD::FLOG10, MVT::f80, Expand); 659 setOperationAction(ISD::FEXP, MVT::f80, Expand); 660 setOperationAction(ISD::FEXP2, MVT::f80, Expand); 661 662 // First set operation action for all vector types to either promote 663 // (for widening) or expand (for scalarization). Then we will selectively 664 // turn on ones that can be effectively codegen'd. 665 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 666 VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { 667 setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); 668 setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); 669 setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); 670 setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); 671 setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); 672 setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); 673 setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); 674 setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); 675 setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); 676 setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); 677 setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); 678 setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); 679 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); 680 setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); 681 setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); 682 setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); 683 setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 684 setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 685 setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); 686 setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); 687 setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); 688 setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); 689 setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); 690 setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); 691 setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); 692 setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 693 
setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 694 setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); 695 setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); 696 setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); 697 setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); 698 setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); 699 setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); 700 setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); 701 setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); 702 setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); 703 setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); 704 setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); 705 setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); 706 setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand); 707 setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); 708 setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); 709 setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); 710 setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); 711 setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); 712 setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); 713 setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); 714 setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 715 setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 716 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand); 717 setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand); 718 setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); 719 setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); 720 setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); 721 for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 722 InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) 723 setTruncStoreAction((MVT::SimpleValueType)VT, 724 (MVT::SimpleValueType)InnerVT, Expand); 725 setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); 726 setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); 727 setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); 728 } 729 730 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 731 // with -msoft-float, disable use of MMX as well. 732 if (!UseSoftFloat && Subtarget->hasMMX()) { 733 addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); 734 // No operations on x86mmx supported, everything uses intrinsics. 735 } 736 737 // MMX-sized vectors (other than x86mmx) are expected to be expanded 738 // into smaller operations. 
  setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
  setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
  setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
  setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
  setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
  setOperationAction(ISD::AND,                MVT::v4i16, Expand);
  setOperationAction(ISD::AND,                MVT::v2i32, Expand);
  setOperationAction(ISD::AND,                MVT::v1i64, Expand);
  setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
  setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
  setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
  setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
  setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
  setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
  setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
  setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
  setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
  setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
  setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
  setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
  setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
  setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
  setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
  setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);

  if (!UseSoftFloat && Subtarget->hasXMM()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC,             MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasXMMInt()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
791 addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); 792 addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); 793 addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); 794 addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); 795 796 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 797 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 798 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 799 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 800 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 801 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 802 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 803 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 804 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 805 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 806 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 807 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 808 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 809 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 810 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 811 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 812 813 setOperationAction(ISD::VSETCC, MVT::v2f64, Custom); 814 setOperationAction(ISD::VSETCC, MVT::v16i8, Custom); 815 setOperationAction(ISD::VSETCC, MVT::v8i16, Custom); 816 setOperationAction(ISD::VSETCC, MVT::v4i32, Custom); 817 818 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 819 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 820 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 821 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 822 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 823 824 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom); 825 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom); 826 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); 827 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); 828 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); 829 830 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 831 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { 832 EVT VT = (MVT::SimpleValueType)i; 833 // Do not attempt to custom lower non-power-of-2 vectors 834 if (!isPowerOf2_32(VT.getVectorNumElements())) 835 continue; 836 // Do not attempt to custom lower non-128-bit vectors 837 if (!VT.is128BitVector()) 838 continue; 839 setOperationAction(ISD::BUILD_VECTOR, 840 VT.getSimpleVT().SimpleTy, Custom); 841 setOperationAction(ISD::VECTOR_SHUFFLE, 842 VT.getSimpleVT().SimpleTy, Custom); 843 setOperationAction(ISD::EXTRACT_VECTOR_ELT, 844 VT.getSimpleVT().SimpleTy, Custom); 845 } 846 847 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 848 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 849 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 850 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 851 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 852 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 853 854 if (Subtarget->is64Bit()) { 855 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 856 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 857 } 858 859 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
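    // Conceptually (illustration only), a bitwise AND of two v4i32 values
    // becomes
    //   bitcast<v4i32>( and(v2i64 bitcast(a), v2i64 bitcast(b)) )
    // so one vector type covers all of the bitwise ops and loads; the
    // AddPromotedToType calls below record v2i64 as that covering type.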
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND,    SVT, Promote);
      AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
      setOperationAction(ISD::OR,     SVT, Promote);
      AddPromotedToType (ISD::OR,     SVT, MVT::v2i64);
      setOperationAction(ISD::XOR,    SVT, Promote);
      AddPromotedToType (ISD::XOR,    SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD,   SVT, Promote);
      AddPromotedToType (ISD::LOAD,   SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
  }

  if (Subtarget->hasSSE41() || Subtarget->hasAVX()) {
    setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
    setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
    setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
    setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
    setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
    setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
    setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
    setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
    setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
    setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);

    // Can turn SHL into an integer multiply.
    setOperationAction(ISD::SHL,                MVT::v4i32, Custom);
    setOperationAction(ISD::SHL,                MVT::v16i8, Custom);

    // i8 and i16 vectors are custom, because the source register and memory
    // operand types are not the same width.  f32 vectors are custom since
    // the immediate controlling the insert encodes additional information.
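    // (Illustration: with SSE4.1, an insert into v16i8 can match pinsrb,
    // whose register form reads a 32-bit GPR while its memory form reads
    // 8 bits, and an insert into v4f32 can match insertps, whose immediate
    // encodes the source lane, destination lane, and a zero mask.)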
915 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 916 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 917 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 918 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 919 920 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); 921 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); 922 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 923 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 924 925 if (Subtarget->is64Bit()) { 926 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); 927 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); 928 } 929 } 930 931 if (Subtarget->hasSSE2() || Subtarget->hasAVX()) { 932 setOperationAction(ISD::SRL, MVT::v2i64, Custom); 933 setOperationAction(ISD::SRL, MVT::v4i32, Custom); 934 setOperationAction(ISD::SRL, MVT::v16i8, Custom); 935 setOperationAction(ISD::SRL, MVT::v8i16, Custom); 936 937 setOperationAction(ISD::SHL, MVT::v2i64, Custom); 938 setOperationAction(ISD::SHL, MVT::v4i32, Custom); 939 setOperationAction(ISD::SHL, MVT::v8i16, Custom); 940 941 setOperationAction(ISD::SRA, MVT::v4i32, Custom); 942 setOperationAction(ISD::SRA, MVT::v8i16, Custom); 943 } 944 945 if (Subtarget->hasSSE42() || Subtarget->hasAVX()) 946 setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); 947 948 if (!UseSoftFloat && Subtarget->hasAVX()) { 949 addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); 950 addRegisterClass(MVT::v16i16, X86::VR256RegisterClass); 951 addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); 952 addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); 953 addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); 954 addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); 955 956 setOperationAction(ISD::LOAD, MVT::v8f32, Legal); 957 setOperationAction(ISD::LOAD, MVT::v4f64, Legal); 958 setOperationAction(ISD::LOAD, MVT::v4i64, Legal); 959 960 setOperationAction(ISD::FADD, MVT::v8f32, Legal); 961 setOperationAction(ISD::FSUB, MVT::v8f32, Legal); 962 setOperationAction(ISD::FMUL, MVT::v8f32, Legal); 963 setOperationAction(ISD::FDIV, MVT::v8f32, Legal); 964 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); 965 setOperationAction(ISD::FNEG, MVT::v8f32, Custom); 966 967 setOperationAction(ISD::FADD, MVT::v4f64, Legal); 968 setOperationAction(ISD::FSUB, MVT::v4f64, Legal); 969 setOperationAction(ISD::FMUL, MVT::v4f64, Legal); 970 setOperationAction(ISD::FDIV, MVT::v4f64, Legal); 971 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); 972 setOperationAction(ISD::FNEG, MVT::v4f64, Custom); 973 974 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); 975 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); 976 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); 977 978 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f64, Custom); 979 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i64, Custom); 980 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); 981 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); 982 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom); 983 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom); 984 985 setOperationAction(ISD::SRL, MVT::v4i64, Custom); 986 setOperationAction(ISD::SRL, MVT::v8i32, Custom); 987 setOperationAction(ISD::SRL, MVT::v16i16, Custom); 988 setOperationAction(ISD::SRL, MVT::v32i8, Custom); 989 990 setOperationAction(ISD::SHL, MVT::v4i64, Custom); 991 setOperationAction(ISD::SHL, MVT::v8i32, Custom); 
992 setOperationAction(ISD::SHL, MVT::v16i16, Custom); 993 setOperationAction(ISD::SHL, MVT::v32i8, Custom); 994 995 setOperationAction(ISD::SRA, MVT::v8i32, Custom); 996 setOperationAction(ISD::SRA, MVT::v16i16, Custom); 997 998 setOperationAction(ISD::VSETCC, MVT::v32i8, Custom); 999 setOperationAction(ISD::VSETCC, MVT::v16i16, Custom); 1000 setOperationAction(ISD::VSETCC, MVT::v8i32, Custom); 1001 setOperationAction(ISD::VSETCC, MVT::v4i64, Custom); 1002 1003 setOperationAction(ISD::SELECT, MVT::v4f64, Custom); 1004 setOperationAction(ISD::SELECT, MVT::v4i64, Custom); 1005 setOperationAction(ISD::SELECT, MVT::v8f32, Custom); 1006 1007 setOperationAction(ISD::ADD, MVT::v4i64, Custom); 1008 setOperationAction(ISD::ADD, MVT::v8i32, Custom); 1009 setOperationAction(ISD::ADD, MVT::v16i16, Custom); 1010 setOperationAction(ISD::ADD, MVT::v32i8, Custom); 1011 1012 setOperationAction(ISD::SUB, MVT::v4i64, Custom); 1013 setOperationAction(ISD::SUB, MVT::v8i32, Custom); 1014 setOperationAction(ISD::SUB, MVT::v16i16, Custom); 1015 setOperationAction(ISD::SUB, MVT::v32i8, Custom); 1016 1017 setOperationAction(ISD::MUL, MVT::v4i64, Custom); 1018 setOperationAction(ISD::MUL, MVT::v8i32, Custom); 1019 setOperationAction(ISD::MUL, MVT::v16i16, Custom); 1020 // Don't lower v32i8 because there is no 128-bit byte mul 1021 1022 // Custom lower several nodes for 256-bit types. 1023 for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 1024 i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { 1025 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 1026 EVT VT = SVT; 1027 1028 // Extract subvector is special because the value type 1029 // (result) is 128-bit but the source is 256-bit wide. 1030 if (VT.is128BitVector()) 1031 setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom); 1032 1033 // Do not attempt to custom lower other non-256-bit vectors 1034 if (!VT.is256BitVector()) 1035 continue; 1036 1037 setOperationAction(ISD::BUILD_VECTOR, SVT, Custom); 1038 setOperationAction(ISD::VECTOR_SHUFFLE, SVT, Custom); 1039 setOperationAction(ISD::INSERT_VECTOR_ELT, SVT, Custom); 1040 setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom); 1041 setOperationAction(ISD::SCALAR_TO_VECTOR, SVT, Custom); 1042 setOperationAction(ISD::INSERT_SUBVECTOR, SVT, Custom); 1043 } 1044 1045 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 1046 for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) { 1047 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 1048 EVT VT = SVT; 1049 1050 // Do not attempt to promote non-256-bit vectors 1051 if (!VT.is256BitVector()) 1052 continue; 1053 1054 setOperationAction(ISD::AND, SVT, Promote); 1055 AddPromotedToType (ISD::AND, SVT, MVT::v4i64); 1056 setOperationAction(ISD::OR, SVT, Promote); 1057 AddPromotedToType (ISD::OR, SVT, MVT::v4i64); 1058 setOperationAction(ISD::XOR, SVT, Promote); 1059 AddPromotedToType (ISD::XOR, SVT, MVT::v4i64); 1060 setOperationAction(ISD::LOAD, SVT, Promote); 1061 AddPromotedToType (ISD::LOAD, SVT, MVT::v4i64); 1062 setOperationAction(ISD::SELECT, SVT, Promote); 1063 AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64); 1064 } 1065 } 1066 1067 // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion 1068 // of this type with custom code. 1069 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 1070 VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) { 1071 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Custom); 1072 } 1073 1074 // We want to custom lower some of our intrinsics. 
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);


  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
    // Add/Sub/Mul with overflow operations are custom lowered.
    MVT VT = IntVTs[i];
    setOperationAction(ISD::SADDO, VT, Custom);
    setOperationAction(ISD::UADDO, VT, Custom);
    setOperationAction(ISD::SSUBO, VT, Custom);
    setOperationAction(ISD::USUBO, VT, Custom);
    setOperationAction(ISD::SMULO, VT, Custom);
    setOperationAction(ISD::UMULO, VT, Custom);
  }

  // There are no 8-bit 3-address imul/mul instructions
  setOperationAction(ISD::SMULO, MVT::i8, Expand);
  setOperationAction(ISD::UMULO, MVT::i8, Expand);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // On Darwin, -Os means optimize for size without hurting performance, so
  // do not reduce the limit.
  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;

  setPrefFunctionAlignment(4);
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
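/// For example (illustrative, not from a testcase): on 32-bit x86 with SSE,
/// a byval struct containing a <4 x float> member gets MaxAlign raised to
/// 16, while a struct of plain i32 fields keeps the default 4-byte
/// alignment computed by getByValTypeAlignment below.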
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasXMM())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, that means the destination alignment can
/// satisfy any constraint. Similarly, if SrcAlign is zero it means there
/// isn't a need to check it against the alignment requirement, probably
/// because the source does not need to be loaded. If 'NonScalarIntSafe' is
/// true, it's safe to return a non-scalar-integer type, e.g. the source is
/// an empty string, a constant, or loaded from memory. 'MemcpyStrSrc'
/// indicates whether the memcpy source is constant so it does not need to
/// be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux.  This is because the stack realignment code can't handle certain
  // cases like PR2962.  This should be removed when PR2962 is fixed.
  const Function *F = MF.getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasXMMInt()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function.  The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
}

// FIXME: Why is this routine here? Move to RegInfo!
std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(EVT VT) const {
  const TargetRegisterClass *RRC = 0;
  uint8_t Cost = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = (Subtarget->is64Bit()
           ?
X86::GR64RegisterClass : X86::GR32RegisterClass); 1300 break; 1301 case MVT::x86mmx: 1302 RRC = X86::VR64RegisterClass; 1303 break; 1304 case MVT::f32: case MVT::f64: 1305 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1306 case MVT::v4f32: case MVT::v2f64: 1307 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1308 case MVT::v4f64: 1309 RRC = X86::VR128RegisterClass; 1310 break; 1311 } 1312 return std::make_pair(RRC, Cost); 1313} 1314 1315bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1316 unsigned &Offset) const { 1317 if (!Subtarget->isTargetLinux()) 1318 return false; 1319 1320 if (Subtarget->is64Bit()) { 1321 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1322 Offset = 0x28; 1323 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1324 AddressSpace = 256; 1325 else 1326 AddressSpace = 257; 1327 } else { 1328 // %gs:0x14 on i386 1329 Offset = 0x14; 1330 AddressSpace = 256; 1331 } 1332 return true; 1333} 1334 1335 1336//===----------------------------------------------------------------------===// 1337// Return Value Calling Convention Implementation 1338//===----------------------------------------------------------------------===// 1339 1340#include "X86GenCallingConv.inc" 1341 1342bool 1343X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, 1344 MachineFunction &MF, bool isVarArg, 1345 const SmallVectorImpl<ISD::OutputArg> &Outs, 1346 LLVMContext &Context) const { 1347 SmallVector<CCValAssign, 16> RVLocs; 1348 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1349 RVLocs, Context); 1350 return CCInfo.CheckReturn(Outs, RetCC_X86); 1351} 1352 1353SDValue 1354X86TargetLowering::LowerReturn(SDValue Chain, 1355 CallingConv::ID CallConv, bool isVarArg, 1356 const SmallVectorImpl<ISD::OutputArg> &Outs, 1357 const SmallVectorImpl<SDValue> &OutVals, 1358 DebugLoc dl, SelectionDAG &DAG) const { 1359 MachineFunction &MF = DAG.getMachineFunction(); 1360 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1361 1362 SmallVector<CCValAssign, 16> RVLocs; 1363 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1364 RVLocs, *DAG.getContext()); 1365 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1366 1367 // Add the regs to the liveout set for the function. 1368 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1369 for (unsigned i = 0; i != RVLocs.size(); ++i) 1370 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1371 MRI.addLiveOut(RVLocs[i].getLocReg()); 1372 1373 SDValue Flag; 1374 1375 SmallVector<SDValue, 6> RetOps; 1376 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1377 // Operand #1 = Bytes To Pop 1378 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1379 MVT::i16)); 1380 1381 // Copy the result values into the output registers. 1382 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1383 CCValAssign &VA = RVLocs[i]; 1384 assert(VA.isRegLoc() && "Can only return in registers!"); 1385 SDValue ValToCopy = OutVals[i]; 1386 EVT ValVT = ValToCopy.getValueType(); 1387 1388 // If this is x86-64, and we disabled SSE, we can't return FP values, 1389 // or SSE or MMX vectors. 1390 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 1391 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 1392 (Subtarget->is64Bit() && !Subtarget->hasXMM())) { 1393 report_fatal_error("SSE register return with SSE disabled"); 1394 } 1395 // Likewise we can't return F64 values with SSE1 only. 
gcc does so, but 1396 // llvm-gcc has never done it right and no one has noticed, so this 1397 // should be OK for now. 1398 if (ValVT == MVT::f64 && 1399 (Subtarget->is64Bit() && !Subtarget->hasXMMInt())) 1400 report_fatal_error("SSE2 register return with SSE2 disabled"); 1401 1402 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1403 // the RET instruction and handled by the FP Stackifier. 1404 if (VA.getLocReg() == X86::ST0 || 1405 VA.getLocReg() == X86::ST1) { 1406 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1407 // change the value to the FP stack register class. 1408 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1409 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1410 RetOps.push_back(ValToCopy); 1411 // Don't emit a copytoreg. 1412 continue; 1413 } 1414 1415 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1416 // which is returned in RAX / RDX. 1417 if (Subtarget->is64Bit()) { 1418 if (ValVT == MVT::x86mmx) { 1419 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1420 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 1421 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1422 ValToCopy); 1423 // If we don't have SSE2 available, convert to v4f32 so the generated 1424 // register is legal. 1425 if (!Subtarget->hasSSE2()) 1426 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 1427 } 1428 } 1429 } 1430 1431 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1432 Flag = Chain.getValue(1); 1433 } 1434 1435 // The x86-64 ABI for returning structs by value requires that we copy 1436 // the sret argument into %rax for the return. We saved the argument into 1437 // a virtual register in the entry block, so now we copy the value out 1438 // and into %rax. 1439 if (Subtarget->is64Bit() && 1440 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1441 MachineFunction &MF = DAG.getMachineFunction(); 1442 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1443 unsigned Reg = FuncInfo->getSRetReturnReg(); 1444 assert(Reg && 1445 "SRetReturnReg should have been set in LowerFormalArguments()."); 1446 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1447 1448 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1449 Flag = Chain.getValue(1); 1450 1451 // RAX now acts like a return value. 1452 MRI.addLiveOut(X86::RAX); 1453 } 1454 1455 RetOps[0] = Chain; // Update chain. 1456 1457 // Add the flag if we have it. 1458 if (Flag.getNode()) 1459 RetOps.push_back(Flag); 1460 1461 return DAG.getNode(X86ISD::RET_FLAG, dl, 1462 MVT::Other, &RetOps[0], RetOps.size()); 1463} 1464 1465bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { 1466 if (N->getNumValues() != 1) 1467 return false; 1468 if (!N->hasNUsesOfValue(1, 0)) 1469 return false; 1470 1471 SDNode *Copy = *N->use_begin(); 1472 if (Copy->getOpcode() != ISD::CopyToReg && 1473 Copy->getOpcode() != ISD::FP_EXTEND) 1474 return false; 1475 1476 bool HasRet = false; 1477 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 1478 UI != UE; ++UI) { 1479 if (UI->getOpcode() != X86ISD::RET_FLAG) 1480 return false; 1481 HasRet = true; 1482 } 1483 1484 return HasRet; 1485} 1486 1487EVT 1488X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, 1489 ISD::NodeType ExtendKind) const { 1490 MVT ReturnMVT; 1491 // TODO: Is this also valid on 32-bit? 
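  // Illustrative example (hypothetical function, not from this file): a
  // 'zeroext i1' return on x86-64 only needs to be widened to i8 by the
  // check below (it comes back in AL); in every other case the minimum
  // return type is the register type computed for MVT::i32.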
1492 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) 1493 ReturnMVT = MVT::i8; 1494 else 1495 ReturnMVT = MVT::i32; 1496 1497 EVT MinVT = getRegisterType(Context, ReturnMVT); 1498 return VT.bitsLT(MinVT) ? MinVT : VT; 1499} 1500 1501/// LowerCallResult - Lower the result values of a call into the 1502/// appropriate copies out of appropriate physical registers. 1503/// 1504SDValue 1505X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1506 CallingConv::ID CallConv, bool isVarArg, 1507 const SmallVectorImpl<ISD::InputArg> &Ins, 1508 DebugLoc dl, SelectionDAG &DAG, 1509 SmallVectorImpl<SDValue> &InVals) const { 1510 1511 // Assign locations to each value returned by this call. 1512 SmallVector<CCValAssign, 16> RVLocs; 1513 bool Is64Bit = Subtarget->is64Bit(); 1514 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 1515 getTargetMachine(), RVLocs, *DAG.getContext()); 1516 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1517 1518 // Copy all of the result registers out of their specified physreg. 1519 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1520 CCValAssign &VA = RVLocs[i]; 1521 EVT CopyVT = VA.getValVT(); 1522 1523 // If this is x86-64, and we disabled SSE, we can't return FP values 1524 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1525 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) { 1526 report_fatal_error("SSE register return with SSE disabled"); 1527 } 1528 1529 SDValue Val; 1530 1531 // If this is a call to a function that returns an fp value on the floating 1532 // point stack, we must guarantee the the value is popped from the stack, so 1533 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1534 // if the return value is not used. We use the FpPOP_RETVAL instruction 1535 // instead. 1536 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1537 // If we prefer to use the value in xmm registers, copy it out as f80 and 1538 // use a truncate to move it from fp stack reg to xmm reg. 1539 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 1540 SDValue Ops[] = { Chain, InFlag }; 1541 Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT, 1542 MVT::Other, MVT::Glue, Ops, 2), 1); 1543 Val = Chain.getValue(0); 1544 1545 // Round the f80 to the right size, which also moves it to the appropriate 1546 // xmm register. 1547 if (CopyVT != VA.getValVT()) 1548 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1549 // This truncation won't change the value. 1550 DAG.getIntPtrConstant(1)); 1551 } else { 1552 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1553 CopyVT, InFlag).getValue(1); 1554 Val = Chain.getValue(0); 1555 } 1556 InFlag = Chain.getValue(2); 1557 InVals.push_back(Val); 1558 } 1559 1560 return Chain; 1561} 1562 1563 1564//===----------------------------------------------------------------------===// 1565// C & StdCall & Fast Calling Convention implementation 1566//===----------------------------------------------------------------------===// 1567// StdCall calling convention seems to be standard for many Windows' API 1568// routines and around. It differs from C calling convention just a little: 1569// callee should clean up the stack, not caller. Symbols should be also 1570// decorated in some fancy way :) It doesn't support any vector arguments. 1571// For info on fast calling convention see Fast Calling Convention (tail call) 1572// implementation LowerX86_32FastCCCallTo. 
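// Illustrative contrast between the two conventions (hypothetical 32-bit
// function f(int x, int y), not taken from this file):
//   C/cdecl:  push y; push x; call _f;   add esp, 8    ; caller cleans up
//   stdcall:  push y; push x; call _f@8                ; callee's 'ret 8' cleans up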
1573 1574/// CallIsStructReturn - Determines whether a call uses struct return 1575/// semantics. 1576static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1577 if (Outs.empty()) 1578 return false; 1579 1580 return Outs[0].Flags.isSRet(); 1581} 1582 1583/// ArgsAreStructReturn - Determines whether a function uses struct 1584/// return semantics. 1585static bool 1586ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1587 if (Ins.empty()) 1588 return false; 1589 1590 return Ins[0].Flags.isSRet(); 1591} 1592 1593/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1594/// by "Src" to address "Dst" with size and alignment information specified by 1595/// the specific parameter attribute. The copy will be passed as a byval 1596/// function parameter. 1597static SDValue 1598CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1599 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1600 DebugLoc dl) { 1601 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1602 1603 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1604 /*isVolatile*/false, /*AlwaysInline=*/true, 1605 MachinePointerInfo(), MachinePointerInfo()); 1606} 1607 1608/// IsTailCallConvention - Return true if the calling convention is one that 1609/// supports tail call optimization. 1610static bool IsTailCallConvention(CallingConv::ID CC) { 1611 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1612} 1613 1614bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 1615 if (!CI->isTailCall()) 1616 return false; 1617 1618 CallSite CS(CI); 1619 CallingConv::ID CalleeCC = CS.getCallingConv(); 1620 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) 1621 return false; 1622 1623 return true; 1624} 1625 1626/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1627/// a tailcall target by changing its ABI. 1628static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1629 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1630} 1631 1632SDValue 1633X86TargetLowering::LowerMemArgument(SDValue Chain, 1634 CallingConv::ID CallConv, 1635 const SmallVectorImpl<ISD::InputArg> &Ins, 1636 DebugLoc dl, SelectionDAG &DAG, 1637 const CCValAssign &VA, 1638 MachineFrameInfo *MFI, 1639 unsigned i) const { 1640 // Create the nodes corresponding to a load from this parameter slot. 1641 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1642 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1643 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1644 EVT ValVT; 1645 1646 // If value is passed by pointer we have address passed instead of the value 1647 // itself. 1648 if (VA.getLocInfo() == CCValAssign::Indirect) 1649 ValVT = VA.getLocVT(); 1650 else 1651 ValVT = VA.getValVT(); 1652 1653 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1654 // changed with more analysis. 1655 // In case of tail call optimization mark all arguments mutable. Since they 1656 // could be overwritten by lowering of arguments in case of a tail call. 1657 if (Flags.isByVal()) { 1658 unsigned Bytes = Flags.getByValSize(); 1659 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. 
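    // Illustrative example (hypothetical type, not from this file): for a
    // parameter '%struct.S* byval %s' where %struct.S is 12 bytes, this
    // creates a 12-byte fixed object at the argument's incoming stack offset
    // and returns the frame index directly; no load is emitted because the
    // aggregate already lives in the caller-pushed argument area.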
1660 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); 1661 return DAG.getFrameIndex(FI, getPointerTy()); 1662 } else { 1663 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1664 VA.getLocMemOffset(), isImmutable); 1665 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1666 return DAG.getLoad(ValVT, dl, Chain, FIN, 1667 MachinePointerInfo::getFixedStack(FI), 1668 false, false, 0); 1669 } 1670} 1671 1672SDValue 1673X86TargetLowering::LowerFormalArguments(SDValue Chain, 1674 CallingConv::ID CallConv, 1675 bool isVarArg, 1676 const SmallVectorImpl<ISD::InputArg> &Ins, 1677 DebugLoc dl, 1678 SelectionDAG &DAG, 1679 SmallVectorImpl<SDValue> &InVals) 1680 const { 1681 MachineFunction &MF = DAG.getMachineFunction(); 1682 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1683 1684 const Function* Fn = MF.getFunction(); 1685 if (Fn->hasExternalLinkage() && 1686 Subtarget->isTargetCygMing() && 1687 Fn->getName() == "main") 1688 FuncInfo->setForceFramePointer(true); 1689 1690 MachineFrameInfo *MFI = MF.getFrameInfo(); 1691 bool Is64Bit = Subtarget->is64Bit(); 1692 bool IsWin64 = Subtarget->isTargetWin64(); 1693 1694 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1695 "Var args not supported with calling convention fastcc or ghc"); 1696 1697 // Assign locations to all of the incoming arguments. 1698 SmallVector<CCValAssign, 16> ArgLocs; 1699 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1700 ArgLocs, *DAG.getContext()); 1701 1702 // Allocate shadow area for Win64 1703 if (IsWin64) { 1704 CCInfo.AllocateStack(32, 8); 1705 } 1706 1707 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 1708 1709 unsigned LastVal = ~0U; 1710 SDValue ArgValue; 1711 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1712 CCValAssign &VA = ArgLocs[i]; 1713 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1714 // places. 1715 assert(VA.getValNo() != LastVal && 1716 "Don't support value assigned to multiple locs yet"); 1717 LastVal = VA.getValNo(); 1718 1719 if (VA.isRegLoc()) { 1720 EVT RegVT = VA.getLocVT(); 1721 TargetRegisterClass *RC = NULL; 1722 if (RegVT == MVT::i32) 1723 RC = X86::GR32RegisterClass; 1724 else if (Is64Bit && RegVT == MVT::i64) 1725 RC = X86::GR64RegisterClass; 1726 else if (RegVT == MVT::f32) 1727 RC = X86::FR32RegisterClass; 1728 else if (RegVT == MVT::f64) 1729 RC = X86::FR64RegisterClass; 1730 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1731 RC = X86::VR256RegisterClass; 1732 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1733 RC = X86::VR128RegisterClass; 1734 else if (RegVT == MVT::x86mmx) 1735 RC = X86::VR64RegisterClass; 1736 else 1737 llvm_unreachable("Unknown argument type!"); 1738 1739 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1740 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1741 1742 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1743 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1744 // right size. 
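      // For example (illustrative): a 'zeroext i8' argument arriving in EDI
      // is modeled as trunc(AssertZext(CopyFromReg(EDI, i32), i8)), which
      // tells later combines that the upper 24 bits are already zero.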
1745 if (VA.getLocInfo() == CCValAssign::SExt) 1746 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1747 DAG.getValueType(VA.getValVT())); 1748 else if (VA.getLocInfo() == CCValAssign::ZExt) 1749 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1750 DAG.getValueType(VA.getValVT())); 1751 else if (VA.getLocInfo() == CCValAssign::BCvt) 1752 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 1753 1754 if (VA.isExtInLoc()) { 1755 // Handle MMX values passed in XMM regs. 1756 if (RegVT.isVector()) { 1757 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), 1758 ArgValue); 1759 } else 1760 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1761 } 1762 } else { 1763 assert(VA.isMemLoc()); 1764 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1765 } 1766 1767 // If value is passed via pointer - do a load. 1768 if (VA.getLocInfo() == CCValAssign::Indirect) 1769 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 1770 MachinePointerInfo(), false, false, 0); 1771 1772 InVals.push_back(ArgValue); 1773 } 1774 1775 // The x86-64 ABI for returning structs by value requires that we copy 1776 // the sret argument into %rax for the return. Save the argument into 1777 // a virtual register so that we can access it from the return points. 1778 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1779 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1780 unsigned Reg = FuncInfo->getSRetReturnReg(); 1781 if (!Reg) { 1782 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1783 FuncInfo->setSRetReturnReg(Reg); 1784 } 1785 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1786 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1787 } 1788 1789 unsigned StackSize = CCInfo.getNextStackOffset(); 1790 // Align stack specially for tail calls. 1791 if (FuncIsMadeTailCallSafe(CallConv)) 1792 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1793 1794 // If the function takes variable number of arguments, make a frame index for 1795 // the start of the first vararg value... for expansion of llvm.va_start. 1796 if (isVarArg) { 1797 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1798 CallConv != CallingConv::X86_ThisCall)) { 1799 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1800 } 1801 if (Is64Bit) { 1802 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1803 1804 // FIXME: We should really autogenerate these arrays 1805 static const unsigned GPR64ArgRegsWin64[] = { 1806 X86::RCX, X86::RDX, X86::R8, X86::R9 1807 }; 1808 static const unsigned GPR64ArgRegs64Bit[] = { 1809 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1810 }; 1811 static const unsigned XMMArgRegs64Bit[] = { 1812 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1813 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1814 }; 1815 const unsigned *GPR64ArgRegs; 1816 unsigned NumXMMRegs = 0; 1817 1818 if (IsWin64) { 1819 // The XMM registers which might contain var arg parameters are shadowed 1820 // in their paired GPR. So we only need to save the GPR to their home 1821 // slots. 
1822 TotalNumIntRegs = 4; 1823 GPR64ArgRegs = GPR64ArgRegsWin64; 1824 } else { 1825 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1826 GPR64ArgRegs = GPR64ArgRegs64Bit; 1827 1828 NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs); 1829 } 1830 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1831 TotalNumIntRegs); 1832 1833 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1834 assert(!(NumXMMRegs && !Subtarget->hasXMM()) && 1835 "SSE register cannot be used when SSE is disabled!"); 1836 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1837 "SSE register cannot be used when SSE is disabled!"); 1838 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM()) 1839 // Kernel mode asks for SSE to be disabled, so don't push them 1840 // on the stack. 1841 TotalNumXMMRegs = 0; 1842 1843 if (IsWin64) { 1844 const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); 1845 // Get to the caller-allocated home save location. Add 8 to account 1846 // for the return address. 1847 int HomeOffset = TFI.getOffsetOfLocalArea() + 8; 1848 FuncInfo->setRegSaveFrameIndex( 1849 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 1850 // Fixup to set vararg frame on shadow area (4 x i64). 1851 if (NumIntRegs < 4) 1852 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 1853 } else { 1854 // For X86-64, if there are vararg parameters that are passed via 1855 // registers, then we must store them to their spots on the stack so they 1856 // may be loaded by deferencing the result of va_next. 1857 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1858 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1859 FuncInfo->setRegSaveFrameIndex( 1860 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1861 false)); 1862 } 1863 1864 // Store the integer parameter registers. 1865 SmallVector<SDValue, 8> MemOps; 1866 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1867 getPointerTy()); 1868 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1869 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1870 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1871 DAG.getIntPtrConstant(Offset)); 1872 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1873 X86::GR64RegisterClass); 1874 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1875 SDValue Store = 1876 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1877 MachinePointerInfo::getFixedStack( 1878 FuncInfo->getRegSaveFrameIndex(), Offset), 1879 false, false, 0); 1880 MemOps.push_back(Store); 1881 Offset += 8; 1882 } 1883 1884 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1885 // Now store the XMM (fp + vector) parameter registers. 
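        // Worked example (illustrative): a System V x86-64 vararg function
        // whose two named arguments are integers has NumIntRegs = 2 and
        // NumXMMRegs = 0, so the 176-byte save area (6 x 8 GPR slots plus
        // 8 x 16 XMM slots) is filled from RDX onward, gp_offset starts at
        // 16 and fp_offset at 48; the XMM stores built below are guarded at
        // run time by the SSE-register count the caller passes in AL.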
1886 SmallVector<SDValue, 11> SaveXMMOps; 1887 SaveXMMOps.push_back(Chain); 1888 1889 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1890 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1891 SaveXMMOps.push_back(ALVal); 1892 1893 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1894 FuncInfo->getRegSaveFrameIndex())); 1895 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1896 FuncInfo->getVarArgsFPOffset())); 1897 1898 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1899 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 1900 X86::VR128RegisterClass); 1901 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1902 SaveXMMOps.push_back(Val); 1903 } 1904 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1905 MVT::Other, 1906 &SaveXMMOps[0], SaveXMMOps.size())); 1907 } 1908 1909 if (!MemOps.empty()) 1910 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1911 &MemOps[0], MemOps.size()); 1912 } 1913 } 1914 1915 // Some CCs need callee pop. 1916 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) { 1917 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1918 } else { 1919 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1920 // If this is an sret function, the return should pop the hidden pointer. 1921 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1922 FuncInfo->setBytesToPopOnReturn(4); 1923 } 1924 1925 if (!Is64Bit) { 1926 // RegSaveFrameIndex is X86-64 only. 1927 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1928 if (CallConv == CallingConv::X86_FastCall || 1929 CallConv == CallingConv::X86_ThisCall) 1930 // fastcc functions can't have varargs. 1931 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1932 } 1933 1934 return Chain; 1935} 1936 1937SDValue 1938X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1939 SDValue StackPtr, SDValue Arg, 1940 DebugLoc dl, SelectionDAG &DAG, 1941 const CCValAssign &VA, 1942 ISD::ArgFlagsTy Flags) const { 1943 unsigned LocMemOffset = VA.getLocMemOffset(); 1944 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1945 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1946 if (Flags.isByVal()) 1947 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1948 1949 return DAG.getStore(Chain, dl, Arg, PtrOff, 1950 MachinePointerInfo::getStack(LocMemOffset), 1951 false, false, 0); 1952} 1953 1954/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1955/// optimization is performed and it is required. 1956SDValue 1957X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1958 SDValue &OutRetAddr, SDValue Chain, 1959 bool IsTailCall, bool Is64Bit, 1960 int FPDiff, DebugLoc dl) const { 1961 // Adjust the Return address stack slot. 1962 EVT VT = getPointerTy(); 1963 OutRetAddr = getReturnAddressFrameIndex(DAG); 1964 1965 // Load the "old" Return address. 1966 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 1967 false, false, 0); 1968 return SDValue(OutRetAddr.getNode(), 1); 1969} 1970 1971/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call 1972/// optimization is performed and it is required (FPDiff!=0). 1973static SDValue 1974EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1975 SDValue Chain, SDValue RetAddrFrIdx, 1976 bool Is64Bit, int FPDiff, DebugLoc dl) { 1977 // Store the return address to the appropriate stack slot. 
1978 if (!FPDiff) return Chain; 1979 // Calculate the new stack slot for the return address. 1980 int SlotSize = Is64Bit ? 8 : 4; 1981 int NewReturnAddrFI = 1982 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 1983 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1984 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1985 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1986 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 1987 false, false, 0); 1988 return Chain; 1989} 1990 1991SDValue 1992X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1993 CallingConv::ID CallConv, bool isVarArg, 1994 bool &isTailCall, 1995 const SmallVectorImpl<ISD::OutputArg> &Outs, 1996 const SmallVectorImpl<SDValue> &OutVals, 1997 const SmallVectorImpl<ISD::InputArg> &Ins, 1998 DebugLoc dl, SelectionDAG &DAG, 1999 SmallVectorImpl<SDValue> &InVals) const { 2000 MachineFunction &MF = DAG.getMachineFunction(); 2001 bool Is64Bit = Subtarget->is64Bit(); 2002 bool IsWin64 = Subtarget->isTargetWin64(); 2003 bool IsStructRet = CallIsStructReturn(Outs); 2004 bool IsSibcall = false; 2005 2006 if (isTailCall) { 2007 // Check if it's really possible to do a tail call. 2008 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 2009 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 2010 Outs, OutVals, Ins, DAG); 2011 2012 // Sibcalls are automatically detected tailcalls which do not require 2013 // ABI changes. 2014 if (!GuaranteedTailCallOpt && isTailCall) 2015 IsSibcall = true; 2016 2017 if (isTailCall) 2018 ++NumTailCalls; 2019 } 2020 2021 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 2022 "Var args not supported with calling convention fastcc or ghc"); 2023 2024 // Analyze operands of the call, assigning locations to each operand. 2025 SmallVector<CCValAssign, 16> ArgLocs; 2026 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 2027 ArgLocs, *DAG.getContext()); 2028 2029 // Allocate shadow area for Win64 2030 if (IsWin64) { 2031 CCInfo.AllocateStack(32, 8); 2032 } 2033 2034 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2035 2036 // Get a count of how many bytes are to be pushed on the stack. 2037 unsigned NumBytes = CCInfo.getNextStackOffset(); 2038 if (IsSibcall) 2039 // This is a sibcall. The memory operands are available in caller's 2040 // own caller's stack. 2041 NumBytes = 0; 2042 else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) 2043 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 2044 2045 int FPDiff = 0; 2046 if (isTailCall && !IsSibcall) { 2047 // Lower arguments at fp - stackoffset + fpdiff. 2048 unsigned NumBytesCallerPushed = 2049 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 2050 FPDiff = NumBytesCallerPushed - NumBytes; 2051 2052 // Set the delta of movement of the returnaddr stackslot. 2053 // But only set if delta is greater than previous delta. 2054 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 2055 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 2056 } 2057 2058 if (!IsSibcall) 2059 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 2060 2061 SDValue RetAddrFrIdx; 2062 // Load return address for tail calls. 
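  // Worked example (illustrative, under -tailcallopt): if the fastcc caller
  // pops 16 bytes of its own stack arguments but the tail callee needs 32,
  // FPDiff = 16 - 32 = -16, so the return address loaded here is later
  // stored 16 bytes lower, at offset FPDiff - SlotSize, by
  // EmitTailCallStoreRetAddr.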
2063 if (isTailCall && FPDiff) 2064 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 2065 Is64Bit, FPDiff, dl); 2066 2067 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 2068 SmallVector<SDValue, 8> MemOpChains; 2069 SDValue StackPtr; 2070 2071 // Walk the register/memloc assignments, inserting copies/loads. In the case 2072 // of tail call optimization arguments are handle later. 2073 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2074 CCValAssign &VA = ArgLocs[i]; 2075 EVT RegVT = VA.getLocVT(); 2076 SDValue Arg = OutVals[i]; 2077 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2078 bool isByVal = Flags.isByVal(); 2079 2080 // Promote the value if needed. 2081 switch (VA.getLocInfo()) { 2082 default: llvm_unreachable("Unknown loc info!"); 2083 case CCValAssign::Full: break; 2084 case CCValAssign::SExt: 2085 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 2086 break; 2087 case CCValAssign::ZExt: 2088 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 2089 break; 2090 case CCValAssign::AExt: 2091 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 2092 // Special case: passing MMX values in XMM registers. 2093 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); 2094 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 2095 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 2096 } else 2097 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 2098 break; 2099 case CCValAssign::BCvt: 2100 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg); 2101 break; 2102 case CCValAssign::Indirect: { 2103 // Store the argument. 2104 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 2105 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 2106 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 2107 MachinePointerInfo::getFixedStack(FI), 2108 false, false, 0); 2109 Arg = SpillSlot; 2110 break; 2111 } 2112 } 2113 2114 if (VA.isRegLoc()) { 2115 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2116 if (isVarArg && IsWin64) { 2117 // Win64 ABI requires argument XMM reg to be copied to the corresponding 2118 // shadow reg if callee is a varargs function. 2119 unsigned ShadowReg = 0; 2120 switch (VA.getLocReg()) { 2121 case X86::XMM0: ShadowReg = X86::RCX; break; 2122 case X86::XMM1: ShadowReg = X86::RDX; break; 2123 case X86::XMM2: ShadowReg = X86::R8; break; 2124 case X86::XMM3: ShadowReg = X86::R9; break; 2125 } 2126 if (ShadowReg) 2127 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 2128 } 2129 } else if (!IsSibcall && (!isTailCall || isByVal)) { 2130 assert(VA.isMemLoc()); 2131 if (StackPtr.getNode() == 0) 2132 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 2133 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2134 dl, DAG, VA, Flags)); 2135 } 2136 } 2137 2138 if (!MemOpChains.empty()) 2139 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2140 &MemOpChains[0], MemOpChains.size()); 2141 2142 // Build a sequence of copy-to-reg nodes chained together with token chain 2143 // and flag operands which copy the outgoing args into registers. 2144 SDValue InFlag; 2145 // Tail call byval lowering might overwrite argument registers so in case of 2146 // tail call optimization the copies to registers are lowered later. 
2147 if (!isTailCall) 2148 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2149 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2150 RegsToPass[i].second, InFlag); 2151 InFlag = Chain.getValue(1); 2152 } 2153 2154 if (Subtarget->isPICStyleGOT()) { 2155 // ELF / PIC requires GOT in the EBX register before function calls via PLT 2156 // GOT pointer. 2157 if (!isTailCall) { 2158 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 2159 DAG.getNode(X86ISD::GlobalBaseReg, 2160 DebugLoc(), getPointerTy()), 2161 InFlag); 2162 InFlag = Chain.getValue(1); 2163 } else { 2164 // If we are tail calling and generating PIC/GOT style code load the 2165 // address of the callee into ECX. The value in ecx is used as target of 2166 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2167 // for tail calls on PIC/GOT architectures. Normally we would just put the 2168 // address of GOT into ebx and then call target@PLT. But for tail calls 2169 // ebx would be restored (since ebx is callee saved) before jumping to the 2170 // target@PLT. 2171 2172 // Note: The actual moving to ECX is done further down. 2173 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2174 if (G && !G->getGlobal()->hasHiddenVisibility() && 2175 !G->getGlobal()->hasProtectedVisibility()) 2176 Callee = LowerGlobalAddress(Callee, DAG); 2177 else if (isa<ExternalSymbolSDNode>(Callee)) 2178 Callee = LowerExternalSymbol(Callee, DAG); 2179 } 2180 } 2181 2182 if (Is64Bit && isVarArg && !IsWin64) { 2183 // From AMD64 ABI document: 2184 // For calls that may call functions that use varargs or stdargs 2185 // (prototype-less calls or calls to functions containing ellipsis (...) in 2186 // the declaration) %al is used as hidden argument to specify the number 2187 // of SSE registers used. The contents of %al do not need to match exactly 2188 // the number of registers, but must be an ubound on the number of SSE 2189 // registers used and is in the range 0 - 8 inclusive. 2190 2191 // Count the number of XMM registers allocated. 2192 static const unsigned XMMArgRegs[] = { 2193 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2194 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2195 }; 2196 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 2197 assert((Subtarget->hasXMM() || !NumXMMRegs) 2198 && "SSE registers cannot be used when SSE is disabled"); 2199 2200 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 2201 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 2202 InFlag = Chain.getValue(1); 2203 } 2204 2205 2206 // For tail calls lower the arguments to the 'real' stack slot. 2207 if (isTailCall) { 2208 // Force all the incoming stack arguments to be loaded from the stack 2209 // before any new outgoing arguments are stored to the stack, because the 2210 // outgoing stack slots may alias the incoming argument stack slots, and 2211 // the alias isn't otherwise explicit. This is slightly more conservative 2212 // than necessary, because it means that each store effectively depends 2213 // on every argument instead of just those arguments it would clobber. 2214 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2215 2216 SmallVector<SDValue, 8> MemOpChains2; 2217 SDValue FIN; 2218 int FI = 0; 2219 // Do not flag preceding copytoreg stuff together with the following stuff. 
2220 InFlag = SDValue(); 2221 if (GuaranteedTailCallOpt) { 2222 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2223 CCValAssign &VA = ArgLocs[i]; 2224 if (VA.isRegLoc()) 2225 continue; 2226 assert(VA.isMemLoc()); 2227 SDValue Arg = OutVals[i]; 2228 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2229 // Create frame index. 2230 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2231 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2232 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 2233 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2234 2235 if (Flags.isByVal()) { 2236 // Copy relative to framepointer. 2237 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2238 if (StackPtr.getNode() == 0) 2239 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 2240 getPointerTy()); 2241 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2242 2243 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2244 ArgChain, 2245 Flags, DAG, dl)); 2246 } else { 2247 // Store relative to framepointer. 2248 MemOpChains2.push_back( 2249 DAG.getStore(ArgChain, dl, Arg, FIN, 2250 MachinePointerInfo::getFixedStack(FI), 2251 false, false, 0)); 2252 } 2253 } 2254 } 2255 2256 if (!MemOpChains2.empty()) 2257 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2258 &MemOpChains2[0], MemOpChains2.size()); 2259 2260 // Copy arguments to their registers. 2261 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2262 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2263 RegsToPass[i].second, InFlag); 2264 InFlag = Chain.getValue(1); 2265 } 2266 InFlag =SDValue(); 2267 2268 // Store the return address to the appropriate stack slot. 2269 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 2270 FPDiff, dl); 2271 } 2272 2273 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2274 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2275 // In the 64-bit large code model, we have to make all calls 2276 // through a register, since the call instruction's 32-bit 2277 // pc-relative offset may not be large enough to hold the whole 2278 // address. 2279 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2280 // If the callee is a GlobalAddress node (quite common, every direct call 2281 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 2282 // it. 2283 2284 // We should use extra load for direct calls to dllimported functions in 2285 // non-JIT mode. 2286 const GlobalValue *GV = G->getGlobal(); 2287 if (!GV->hasDLLImportLinkage()) { 2288 unsigned char OpFlags = 0; 2289 bool ExtraLoad = false; 2290 unsigned WrapperKind = ISD::DELETED_NODE; 2291 2292 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2293 // external symbols most go through the PLT in PIC mode. If the symbol 2294 // has hidden or protected visibility, or if it is static or local, then 2295 // we don't need to use the PLT - we can directly call it. 
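      // Illustrative outcome (hypothetical symbols): with PIC on ELF, a call
      // to an undefined external 'foo' gets MO_PLT below and is emitted as
      // 'call foo@PLT', while a call to a hidden-visibility definition in
      // the same module remains a plain direct call.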
2296 if (Subtarget->isTargetELF() && 2297 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2298 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2299 OpFlags = X86II::MO_PLT; 2300 } else if (Subtarget->isPICStyleStubAny() && 2301 (GV->isDeclaration() || GV->isWeakForLinker()) && 2302 (!Subtarget->getTargetTriple().isMacOSX() || 2303 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2304 // PC-relative references to external symbols should go through $stub, 2305 // unless we're building with the leopard linker or later, which 2306 // automatically synthesizes these stubs. 2307 OpFlags = X86II::MO_DARWIN_STUB; 2308 } else if (Subtarget->isPICStyleRIPRel() && 2309 isa<Function>(GV) && 2310 cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) { 2311 // If the function is marked as non-lazy, generate an indirect call 2312 // which loads from the GOT directly. This avoids runtime overhead 2313 // at the cost of eager binding (and one extra byte of encoding). 2314 OpFlags = X86II::MO_GOTPCREL; 2315 WrapperKind = X86ISD::WrapperRIP; 2316 ExtraLoad = true; 2317 } 2318 2319 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2320 G->getOffset(), OpFlags); 2321 2322 // Add a wrapper if needed. 2323 if (WrapperKind != ISD::DELETED_NODE) 2324 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); 2325 // Add extra indirection if needed. 2326 if (ExtraLoad) 2327 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, 2328 MachinePointerInfo::getGOT(), 2329 false, false, 0); 2330 } 2331 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2332 unsigned char OpFlags = 0; 2333 2334 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to 2335 // external symbols should go through the PLT. 2336 if (Subtarget->isTargetELF() && 2337 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2338 OpFlags = X86II::MO_PLT; 2339 } else if (Subtarget->isPICStyleStubAny() && 2340 (!Subtarget->getTargetTriple().isMacOSX() || 2341 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2342 // PC-relative references to external symbols should go through $stub, 2343 // unless we're building with the leopard linker or later, which 2344 // automatically synthesizes these stubs. 2345 OpFlags = X86II::MO_DARWIN_STUB; 2346 } 2347 2348 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2349 OpFlags); 2350 } 2351 2352 // Returns a chain & a flag for retval copy to use. 2353 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2354 SmallVector<SDValue, 8> Ops; 2355 2356 if (!IsSibcall && isTailCall) { 2357 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2358 DAG.getIntPtrConstant(0, true), InFlag); 2359 InFlag = Chain.getValue(1); 2360 } 2361 2362 Ops.push_back(Chain); 2363 Ops.push_back(Callee); 2364 2365 if (isTailCall) 2366 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2367 2368 // Add argument registers to the end of the list so that they are known live 2369 // into the call. 2370 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2371 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2372 RegsToPass[i].second.getValueType())); 2373 2374 // Add an implicit use GOT pointer in EBX. 2375 if (!isTailCall && Subtarget->isPICStyleGOT()) 2376 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2377 2378 // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. 
2379 if (Is64Bit && isVarArg && !IsWin64) 2380 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 2381 2382 if (InFlag.getNode()) 2383 Ops.push_back(InFlag); 2384 2385 if (isTailCall) { 2386 // We used to do: 2387 //// If this is the first return lowered for this function, add the regs 2388 //// to the liveout set for the function. 2389 // This isn't right, although it's probably harmless on x86; liveouts 2390 // should be computed from returns not tail calls. Consider a void 2391 // function making a tail call to a function returning int. 2392 return DAG.getNode(X86ISD::TC_RETURN, dl, 2393 NodeTys, &Ops[0], Ops.size()); 2394 } 2395 2396 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2397 InFlag = Chain.getValue(1); 2398 2399 // Create the CALLSEQ_END node. 2400 unsigned NumBytesForCalleeToPush; 2401 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) 2402 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2403 else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) 2404 // If this is a call to a struct-return function, the callee 2405 // pops the hidden struct pointer, so we have to push it back. 2406 // This is common for Darwin/X86, Linux & Mingw32 targets. 2407 NumBytesForCalleeToPush = 4; 2408 else 2409 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2410 2411 // Returns a flag for retval copy to use. 2412 if (!IsSibcall) { 2413 Chain = DAG.getCALLSEQ_END(Chain, 2414 DAG.getIntPtrConstant(NumBytes, true), 2415 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2416 true), 2417 InFlag); 2418 InFlag = Chain.getValue(1); 2419 } 2420 2421 // Handle result values, copying them out of physregs into vregs that we 2422 // return. 2423 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2424 Ins, dl, DAG, InVals); 2425} 2426 2427 2428//===----------------------------------------------------------------------===// 2429// Fast Calling Convention (tail call) implementation 2430//===----------------------------------------------------------------------===// 2431 2432// Like std call, callee cleans arguments, convention except that ECX is 2433// reserved for storing the tail called function address. Only 2 registers are 2434// free for argument passing (inreg). Tail call optimization is performed 2435// provided: 2436// * tailcallopt is enabled 2437// * caller/callee are fastcc 2438// On X86_64 architecture with GOT-style position independent code only local 2439// (within module) calls are supported at the moment. 2440// To keep the stack aligned according to platform abi the function 2441// GetAlignedArgumentStackSize ensures that argument delta is always multiples 2442// of stack alignment. (Dynamic linkers need this - darwin's dyld for example) 2443// If a tail called function callee has more arguments than the caller the 2444// caller needs to make sure that there is room to move the RETADDR to. This is 2445// achieved by reserving an area the size of the argument delta right after the 2446// original REtADDR, but before the saved framepointer or the spilled registers 2447// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) 2448// stack layout: 2449// arg1 2450// arg2 2451// RETADDR 2452// [ new RETADDR 2453// move area ] 2454// (possible EBP) 2455// ESI 2456// EDI 2457// local1 .. 2458 2459/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned 2460/// for a 16 byte align requirement. 
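/// A worked example (illustrative, x86-32 with 16-byte stack alignment and
/// a 4-byte slot): StackSize = 20 gives 20 & 15 = 4 <= 12, so the result is
/// 20 + (12 - 4) = 28 = 16*1 + 12; StackSize = 30 gives 30 & 15 = 14 > 12,
/// so the result is (30 & ~15) + 16 + 12 = 44 = 16*2 + 12.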
2461unsigned 2462X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2463 SelectionDAG& DAG) const { 2464 MachineFunction &MF = DAG.getMachineFunction(); 2465 const TargetMachine &TM = MF.getTarget(); 2466 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 2467 unsigned StackAlignment = TFI.getStackAlignment(); 2468 uint64_t AlignMask = StackAlignment - 1; 2469 int64_t Offset = StackSize; 2470 uint64_t SlotSize = TD->getPointerSize(); 2471 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2472 // Number smaller than 12 so just add the difference. 2473 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2474 } else { 2475 // Mask out lower bits, add stackalignment once plus the 12 bytes. 2476 Offset = ((~AlignMask) & Offset) + StackAlignment + 2477 (StackAlignment-SlotSize); 2478 } 2479 return Offset; 2480} 2481 2482/// MatchingStackOffset - Return true if the given stack call argument is 2483/// already available in the same position (relatively) of the caller's 2484/// incoming argument stack. 2485static 2486bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2487 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2488 const X86InstrInfo *TII) { 2489 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2490 int FI = INT_MAX; 2491 if (Arg.getOpcode() == ISD::CopyFromReg) { 2492 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2493 if (!TargetRegisterInfo::isVirtualRegister(VR)) 2494 return false; 2495 MachineInstr *Def = MRI->getVRegDef(VR); 2496 if (!Def) 2497 return false; 2498 if (!Flags.isByVal()) { 2499 if (!TII->isLoadFromStackSlot(Def, FI)) 2500 return false; 2501 } else { 2502 unsigned Opcode = Def->getOpcode(); 2503 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2504 Def->getOperand(1).isFI()) { 2505 FI = Def->getOperand(1).getIndex(); 2506 Bytes = Flags.getByValSize(); 2507 } else 2508 return false; 2509 } 2510 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2511 if (Flags.isByVal()) 2512 // ByVal argument is passed in as a pointer but it's now being 2513 // dereferenced. e.g. 2514 // define @foo(%struct.X* %A) { 2515 // tail call @bar(%struct.X* byval %A) 2516 // } 2517 return false; 2518 SDValue Ptr = Ld->getBasePtr(); 2519 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2520 if (!FINode) 2521 return false; 2522 FI = FINode->getIndex(); 2523 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 2524 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 2525 FI = FINode->getIndex(); 2526 Bytes = Flags.getByValSize(); 2527 } else 2528 return false; 2529 2530 assert(FI != INT_MAX); 2531 if (!MFI->isFixedObjectIndex(FI)) 2532 return false; 2533 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2534} 2535 2536/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2537/// for tail call optimization. Targets which want to do tail call 2538/// optimization should implement this function. 
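/// The common shape that passes these checks (hypothetical IR, illustrative
/// only) is a matching-convention call whose result feeds the return
/// directly and whose arguments fit in registers or reuse the caller's
/// incoming stack slots:
///   define i32 @caller(i32 %x) {
///     %r = tail call i32 @callee(i32 %x)
///     ret i32 %r
///   }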
2539bool 2540X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2541 CallingConv::ID CalleeCC, 2542 bool isVarArg, 2543 bool isCalleeStructRet, 2544 bool isCallerStructRet, 2545 const SmallVectorImpl<ISD::OutputArg> &Outs, 2546 const SmallVectorImpl<SDValue> &OutVals, 2547 const SmallVectorImpl<ISD::InputArg> &Ins, 2548 SelectionDAG& DAG) const { 2549 if (!IsTailCallConvention(CalleeCC) && 2550 CalleeCC != CallingConv::C) 2551 return false; 2552 2553 // If -tailcallopt is specified, make fastcc functions tail-callable. 2554 const MachineFunction &MF = DAG.getMachineFunction(); 2555 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2556 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2557 bool CCMatch = CallerCC == CalleeCC; 2558 2559 if (GuaranteedTailCallOpt) { 2560 if (IsTailCallConvention(CalleeCC) && CCMatch) 2561 return true; 2562 return false; 2563 } 2564 2565 // Look for obvious safe cases to perform tail call optimization that do not 2566 // require ABI changes. This is what gcc calls sibcall. 2567 2568 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2569 // emit a special epilogue. 2570 if (RegInfo->needsStackRealignment(MF)) 2571 return false; 2572 2573 // Also avoid sibcall optimization if either caller or callee uses struct 2574 // return semantics. 2575 if (isCalleeStructRet || isCallerStructRet) 2576 return false; 2577 2578 // An stdcall caller is expected to clean up its arguments; the callee 2579 // isn't going to do that. 2580 if (!CCMatch && CallerCC==CallingConv::X86_StdCall) 2581 return false; 2582 2583 // Do not sibcall optimize vararg calls unless all arguments are passed via 2584 // registers. 2585 if (isVarArg && !Outs.empty()) { 2586 2587 // Optimizing for varargs on Win64 is unlikely to be safe without 2588 // additional testing. 2589 if (Subtarget->isTargetWin64()) 2590 return false; 2591 2592 SmallVector<CCValAssign, 16> ArgLocs; 2593 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2594 getTargetMachine(), ArgLocs, *DAG.getContext()); 2595 2596 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2597 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 2598 if (!ArgLocs[i].isRegLoc()) 2599 return false; 2600 } 2601 2602 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2603 // Therefore if it's not used by the call it is not safe to optimize this into 2604 // a sibcall. 2605 bool Unused = false; 2606 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2607 if (!Ins[i].Used) { 2608 Unused = true; 2609 break; 2610 } 2611 } 2612 if (Unused) { 2613 SmallVector<CCValAssign, 16> RVLocs; 2614 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), 2615 getTargetMachine(), RVLocs, *DAG.getContext()); 2616 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2617 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2618 CCValAssign &VA = RVLocs[i]; 2619 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2620 return false; 2621 } 2622 } 2623 2624 // If the calling conventions do not match, then we'd better make sure the 2625 // results are returned in the same way as what the caller expects. 
2626 if (!CCMatch) { 2627 SmallVector<CCValAssign, 16> RVLocs1; 2628 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), 2629 getTargetMachine(), RVLocs1, *DAG.getContext()); 2630 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2631 2632 SmallVector<CCValAssign, 16> RVLocs2; 2633 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), 2634 getTargetMachine(), RVLocs2, *DAG.getContext()); 2635 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2636 2637 if (RVLocs1.size() != RVLocs2.size()) 2638 return false; 2639 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2640 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2641 return false; 2642 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2643 return false; 2644 if (RVLocs1[i].isRegLoc()) { 2645 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2646 return false; 2647 } else { 2648 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2649 return false; 2650 } 2651 } 2652 } 2653 2654 // If the callee takes no arguments then go on to check the results of the 2655 // call. 2656 if (!Outs.empty()) { 2657 // Check if stack adjustment is needed. For now, do not do this if any 2658 // argument is passed on the stack. 2659 SmallVector<CCValAssign, 16> ArgLocs; 2660 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2661 getTargetMachine(), ArgLocs, *DAG.getContext()); 2662 2663 // Allocate shadow area for Win64 2664 if (Subtarget->isTargetWin64()) { 2665 CCInfo.AllocateStack(32, 8); 2666 } 2667 2668 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2669 if (CCInfo.getNextStackOffset()) { 2670 MachineFunction &MF = DAG.getMachineFunction(); 2671 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2672 return false; 2673 2674 // Check if the arguments are already laid out in the right way as 2675 // the caller's fixed stack objects. 2676 MachineFrameInfo *MFI = MF.getFrameInfo(); 2677 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2678 const X86InstrInfo *TII = 2679 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2680 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2681 CCValAssign &VA = ArgLocs[i]; 2682 SDValue Arg = OutVals[i]; 2683 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2684 if (VA.getLocInfo() == CCValAssign::Indirect) 2685 return false; 2686 if (!VA.isRegLoc()) { 2687 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2688 MFI, MRI, TII)) 2689 return false; 2690 } 2691 } 2692 } 2693 2694 // If the tailcall address may be in a register, then make sure it's 2695 // possible to register allocate for it. In 32-bit, the call address can 2696 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2697 // callee-saved registers are restored. These happen to be the same 2698 // registers used to pass 'inreg' arguments so watch out for those. 
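    // For example (illustrative): a 32-bit indirect sibcall whose arguments
    // already occupy EAX, EDX and ECX (three 'inreg' arguments) leaves no
    // register for the call target once the callee-saved registers have been
    // restored, so the check below rejects it.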
2699 if (!Subtarget->is64Bit() && 2700 !isa<GlobalAddressSDNode>(Callee) && 2701 !isa<ExternalSymbolSDNode>(Callee)) { 2702 unsigned NumInRegs = 0; 2703 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2704 CCValAssign &VA = ArgLocs[i]; 2705 if (!VA.isRegLoc()) 2706 continue; 2707 unsigned Reg = VA.getLocReg(); 2708 switch (Reg) { 2709 default: break; 2710 case X86::EAX: case X86::EDX: case X86::ECX: 2711 if (++NumInRegs == 3) 2712 return false; 2713 break; 2714 } 2715 } 2716 } 2717 } 2718 2719 return true; 2720} 2721 2722FastISel * 2723X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2724 return X86::createFastISel(funcInfo); 2725} 2726 2727 2728//===----------------------------------------------------------------------===// 2729// Other Lowering Hooks 2730//===----------------------------------------------------------------------===// 2731 2732static bool MayFoldLoad(SDValue Op) { 2733 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 2734} 2735 2736static bool MayFoldIntoStore(SDValue Op) { 2737 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 2738} 2739 2740static bool isTargetShuffle(unsigned Opcode) { 2741 switch(Opcode) { 2742 default: return false; 2743 case X86ISD::PSHUFD: 2744 case X86ISD::PSHUFHW: 2745 case X86ISD::PSHUFLW: 2746 case X86ISD::SHUFPD: 2747 case X86ISD::PALIGN: 2748 case X86ISD::SHUFPS: 2749 case X86ISD::MOVLHPS: 2750 case X86ISD::MOVLHPD: 2751 case X86ISD::MOVHLPS: 2752 case X86ISD::MOVLPS: 2753 case X86ISD::MOVLPD: 2754 case X86ISD::MOVSHDUP: 2755 case X86ISD::MOVSLDUP: 2756 case X86ISD::MOVDDUP: 2757 case X86ISD::MOVSS: 2758 case X86ISD::MOVSD: 2759 case X86ISD::UNPCKLPS: 2760 case X86ISD::UNPCKLPD: 2761 case X86ISD::VUNPCKLPSY: 2762 case X86ISD::VUNPCKLPDY: 2763 case X86ISD::PUNPCKLWD: 2764 case X86ISD::PUNPCKLBW: 2765 case X86ISD::PUNPCKLDQ: 2766 case X86ISD::PUNPCKLQDQ: 2767 case X86ISD::UNPCKHPS: 2768 case X86ISD::UNPCKHPD: 2769 case X86ISD::VUNPCKHPSY: 2770 case X86ISD::VUNPCKHPDY: 2771 case X86ISD::PUNPCKHWD: 2772 case X86ISD::PUNPCKHBW: 2773 case X86ISD::PUNPCKHDQ: 2774 case X86ISD::PUNPCKHQDQ: 2775 case X86ISD::VPERMILPS: 2776 case X86ISD::VPERMILPSY: 2777 case X86ISD::VPERMILPD: 2778 case X86ISD::VPERMILPDY: 2779 case X86ISD::VPERM2F128: 2780 return true; 2781 } 2782 return false; 2783} 2784 2785static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2786 SDValue V1, SelectionDAG &DAG) { 2787 switch(Opc) { 2788 default: llvm_unreachable("Unknown x86 shuffle node"); 2789 case X86ISD::MOVSHDUP: 2790 case X86ISD::MOVSLDUP: 2791 case X86ISD::MOVDDUP: 2792 return DAG.getNode(Opc, dl, VT, V1); 2793 } 2794 2795 return SDValue(); 2796} 2797 2798static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2799 SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { 2800 switch(Opc) { 2801 default: llvm_unreachable("Unknown x86 shuffle node"); 2802 case X86ISD::PSHUFD: 2803 case X86ISD::PSHUFHW: 2804 case X86ISD::PSHUFLW: 2805 case X86ISD::VPERMILPS: 2806 case X86ISD::VPERMILPSY: 2807 case X86ISD::VPERMILPD: 2808 case X86ISD::VPERMILPDY: 2809 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2810 } 2811 2812 return SDValue(); 2813} 2814 2815static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2816 SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { 2817 switch(Opc) { 2818 default: llvm_unreachable("Unknown x86 shuffle node"); 2819 case X86ISD::PALIGN: 2820 case X86ISD::SHUFPD: 2821 case X86ISD::SHUFPS: 2822 case 
X86ISD::VPERM2F128: 2823 return DAG.getNode(Opc, dl, VT, V1, V2, 2824 DAG.getConstant(TargetMask, MVT::i8)); 2825 } 2826 return SDValue(); 2827} 2828 2829static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2830 SDValue V1, SDValue V2, SelectionDAG &DAG) { 2831 switch(Opc) { 2832 default: llvm_unreachable("Unknown x86 shuffle node"); 2833 case X86ISD::MOVLHPS: 2834 case X86ISD::MOVLHPD: 2835 case X86ISD::MOVHLPS: 2836 case X86ISD::MOVLPS: 2837 case X86ISD::MOVLPD: 2838 case X86ISD::MOVSS: 2839 case X86ISD::MOVSD: 2840 case X86ISD::UNPCKLPS: 2841 case X86ISD::UNPCKLPD: 2842 case X86ISD::VUNPCKLPSY: 2843 case X86ISD::VUNPCKLPDY: 2844 case X86ISD::PUNPCKLWD: 2845 case X86ISD::PUNPCKLBW: 2846 case X86ISD::PUNPCKLDQ: 2847 case X86ISD::PUNPCKLQDQ: 2848 case X86ISD::UNPCKHPS: 2849 case X86ISD::UNPCKHPD: 2850 case X86ISD::VUNPCKHPSY: 2851 case X86ISD::VUNPCKHPDY: 2852 case X86ISD::PUNPCKHWD: 2853 case X86ISD::PUNPCKHBW: 2854 case X86ISD::PUNPCKHDQ: 2855 case X86ISD::PUNPCKHQDQ: 2856 return DAG.getNode(Opc, dl, VT, V1, V2); 2857 } 2858 return SDValue(); 2859} 2860 2861SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 2862 MachineFunction &MF = DAG.getMachineFunction(); 2863 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2864 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2865 2866 if (ReturnAddrIndex == 0) { 2867 // Set up a frame object for the return address. 2868 uint64_t SlotSize = TD->getPointerSize(); 2869 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2870 false); 2871 FuncInfo->setRAIndex(ReturnAddrIndex); 2872 } 2873 2874 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2875} 2876 2877 2878bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2879 bool hasSymbolicDisplacement) { 2880 // Offset should fit into 32 bit immediate field. 2881 if (!isInt<32>(Offset)) 2882 return false; 2883 2884 // If we don't have a symbolic displacement - we don't have any extra 2885 // restrictions. 2886 if (!hasSymbolicDisplacement) 2887 return true; 2888 2889 // FIXME: Some tweaks might be needed for medium code model. 2890 if (M != CodeModel::Small && M != CodeModel::Kernel) 2891 return false; 2892 2893 // For small code model we assume that latest object is 16MB before end of 31 2894 // bits boundary. We may also accept pretty large negative constants knowing 2895 // that all objects are in the positive half of address space. 2896 if (M == CodeModel::Small && Offset < 16*1024*1024) 2897 return true; 2898 2899 // For kernel code model we know that all object resist in the negative half 2900 // of 32bits address space. We may not accept negative offsets, since they may 2901 // be just off and we may accept pretty large positive ones. 2902 if (M == CodeModel::Kernel && Offset > 0) 2903 return true; 2904 2905 return false; 2906} 2907 2908/// isCalleePop - Determines whether the callee is required to pop its 2909/// own arguments. Callee pop is necessary to support tail calls. 
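/// For example (illustrative): a 32-bit X86_StdCall function taking 12
/// bytes of arguments returns with 'ret 12' and so reports true here, while
/// any vararg call, or a 64-bit C-convention call, reports false.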
2910bool X86::isCalleePop(CallingConv::ID CallingConv, 2911 bool is64Bit, bool IsVarArg, bool TailCallOpt) { 2912 if (IsVarArg) 2913 return false; 2914 2915 switch (CallingConv) { 2916 default: 2917 return false; 2918 case CallingConv::X86_StdCall: 2919 return !is64Bit; 2920 case CallingConv::X86_FastCall: 2921 return !is64Bit; 2922 case CallingConv::X86_ThisCall: 2923 return !is64Bit; 2924 case CallingConv::Fast: 2925 return TailCallOpt; 2926 case CallingConv::GHC: 2927 return TailCallOpt; 2928 } 2929} 2930 2931/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 2932/// specific condition code, returning the condition code and the LHS/RHS of the 2933/// comparison to make. 2934static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2935 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2936 if (!isFP) { 2937 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2938 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2939 // X > -1 -> X == 0, jump !sign. 2940 RHS = DAG.getConstant(0, RHS.getValueType()); 2941 return X86::COND_NS; 2942 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2943 // X < 0 -> X == 0, jump on sign. 2944 return X86::COND_S; 2945 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2946 // X < 1 -> X <= 0 2947 RHS = DAG.getConstant(0, RHS.getValueType()); 2948 return X86::COND_LE; 2949 } 2950 } 2951 2952 switch (SetCCOpcode) { 2953 default: llvm_unreachable("Invalid integer condition!"); 2954 case ISD::SETEQ: return X86::COND_E; 2955 case ISD::SETGT: return X86::COND_G; 2956 case ISD::SETGE: return X86::COND_GE; 2957 case ISD::SETLT: return X86::COND_L; 2958 case ISD::SETLE: return X86::COND_LE; 2959 case ISD::SETNE: return X86::COND_NE; 2960 case ISD::SETULT: return X86::COND_B; 2961 case ISD::SETUGT: return X86::COND_A; 2962 case ISD::SETULE: return X86::COND_BE; 2963 case ISD::SETUGE: return X86::COND_AE; 2964 } 2965 } 2966 2967 // First determine if it is required or is profitable to flip the operands. 2968 2969 // If LHS is a foldable load, but RHS is not, flip the condition. 
2970 if (ISD::isNON_EXTLoad(LHS.getNode()) && 2971 !ISD::isNON_EXTLoad(RHS.getNode())) { 2972 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2973 std::swap(LHS, RHS); 2974 } 2975
2976 switch (SetCCOpcode) { 2977 default: break; 2978 case ISD::SETOLT: 2979 case ISD::SETOLE: 2980 case ISD::SETUGT: 2981 case ISD::SETUGE: 2982 std::swap(LHS, RHS); 2983 break; 2984 } 2985
2986 // On a floating point condition, the flags are set as follows: 2987 // ZF PF CF op 2988 // 0 | 0 | 0 | X > Y 2989 // 0 | 0 | 1 | X < Y 2990 // 1 | 0 | 0 | X == Y 2991 // 1 | 1 | 1 | unordered 2992 switch (SetCCOpcode) { 2993 default: llvm_unreachable("Condcode should be pre-legalized away"); 2994 case ISD::SETUEQ: 2995 case ISD::SETEQ: return X86::COND_E; 2996 case ISD::SETOLT: // flipped 2997 case ISD::SETOGT: 2998 case ISD::SETGT: return X86::COND_A; 2999 case ISD::SETOLE: // flipped 3000 case ISD::SETOGE: 3001 case ISD::SETGE: return X86::COND_AE; 3002 case ISD::SETUGT: // flipped 3003 case ISD::SETULT: 3004 case ISD::SETLT: return X86::COND_B; 3005 case ISD::SETUGE: // flipped 3006 case ISD::SETULE: 3007 case ISD::SETLE: return X86::COND_BE; 3008 case ISD::SETONE: 3009 case ISD::SETNE: return X86::COND_NE; 3010 case ISD::SETUO: return X86::COND_P; 3011 case ISD::SETO: return X86::COND_NP; 3012 case ISD::SETOEQ: 3013 case ISD::SETUNE: return X86::COND_INVALID; 3014 } 3015} 3016
3017/// hasFPCMov - is there a floating point cmov for the specific X86 condition 3018/// code? The current x86 ISA includes the following FP cmov instructions: 3019/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 3020static bool hasFPCMov(unsigned X86CC) { 3021 switch (X86CC) { 3022 default: 3023 return false; 3024 case X86::COND_B: 3025 case X86::COND_BE: 3026 case X86::COND_E: 3027 case X86::COND_P: 3028 case X86::COND_A: 3029 case X86::COND_AE: 3030 case X86::COND_NE: 3031 case X86::COND_NP: 3032 return true; 3033 } 3034} 3035
3036/// isFPImmLegal - Returns true if the target can instruction select the 3037/// specified FP immediate natively. If false, the legalizer will 3038/// materialize the FP immediate as a load from a constant pool. 3039bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 3040 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 3041 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 3042 return true; 3043 } 3044 return false; 3045} 3046
3047/// isUndefOrInRange - Return true if Val is undef or if its value falls within 3048/// the specified range [Low, Hi). 3049static bool isUndefOrInRange(int Val, int Low, int Hi) { 3050 return (Val < 0) || (Val >= Low && Val < Hi); 3051} 3052
3053/// isUndefOrInRange - Return true if every element in Mask, beginning 3054/// at position Pos and ending at Pos+Size, falls within the specified 3055/// range [Low, Hi) or is undef. 3056static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask, 3057 int Pos, int Size, int Low, int Hi) { 3058 for (int i = Pos, e = Pos+Size; i != e; ++i) 3059 if (!isUndefOrInRange(Mask[i], Low, Hi)) 3060 return false; 3061 return true; 3062} 3063
3064/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 3065/// specified value. 3066static bool isUndefOrEqual(int Val, int CmpVal) { 3067 if (Val < 0 || Val == CmpVal) 3068 return true; 3069 return false; 3070} 3071
3072/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning 3073/// at position Pos and ending at Pos+Size, matches the specified 3074/// sequential range [Low, Low+Size) in order, or is undef.
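/// For example, with Pos = 0, Size = 4 and Low = 4, the mask <u, 5, 6, u> is sequential: every defined element equals Low plus its offset from Pos.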
3075static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask, 3076 int Pos, int Size, int Low) { 3077 for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low) 3078 if (!isUndefOrEqual(Mask[i], Low)) 3079 return false; 3080 return true; 3081} 3082 3083/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 3084/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 3085/// the second operand. 3086static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3087 if (VT == MVT::v4f32 || VT == MVT::v4i32 ) 3088 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 3089 if (VT == MVT::v2f64 || VT == MVT::v2i64) 3090 return (Mask[0] < 2 && Mask[1] < 2); 3091 return false; 3092} 3093 3094bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 3095 SmallVector<int, 8> M; 3096 N->getMask(M); 3097 return ::isPSHUFDMask(M, N->getValueType(0)); 3098} 3099 3100/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 3101/// is suitable for input to PSHUFHW. 3102static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3103 if (VT != MVT::v8i16) 3104 return false; 3105 3106 // Lower quadword copied in order or undef. 3107 for (int i = 0; i != 4; ++i) 3108 if (Mask[i] >= 0 && Mask[i] != i) 3109 return false; 3110 3111 // Upper quadword shuffled. 3112 for (int i = 4; i != 8; ++i) 3113 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 3114 return false; 3115 3116 return true; 3117} 3118 3119bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 3120 SmallVector<int, 8> M; 3121 N->getMask(M); 3122 return ::isPSHUFHWMask(M, N->getValueType(0)); 3123} 3124 3125/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 3126/// is suitable for input to PSHUFLW. 3127static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3128 if (VT != MVT::v8i16) 3129 return false; 3130 3131 // Upper quadword copied in order. 3132 for (int i = 4; i != 8; ++i) 3133 if (Mask[i] >= 0 && Mask[i] != i) 3134 return false; 3135 3136 // Lower quadword shuffled. 3137 for (int i = 0; i != 4; ++i) 3138 if (Mask[i] >= 4) 3139 return false; 3140 3141 return true; 3142} 3143 3144bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 3145 SmallVector<int, 8> M; 3146 N->getMask(M); 3147 return ::isPSHUFLWMask(M, N->getValueType(0)); 3148} 3149 3150/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 3151/// is suitable for input to PALIGNR. 3152static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 3153 bool hasSSSE3) { 3154 int i, e = VT.getVectorNumElements(); 3155 if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64) 3156 return false; 3157 3158 // Do not handle v2i64 / v2f64 shuffles with palignr. 3159 if (e < 4 || !hasSSSE3) 3160 return false; 3161 3162 for (i = 0; i != e; ++i) 3163 if (Mask[i] >= 0) 3164 break; 3165 3166 // All undef, not a palignr. 3167 if (i == e) 3168 return false; 3169 3170 // Make sure we're shifting in the right direction. 3171 if (Mask[i] <= i) 3172 return false; 3173 3174 int s = Mask[i] - i; 3175 3176 // Check the rest of the elements to see if they are consecutive. 3177 for (++i; i != e; ++i) { 3178 int m = Mask[i]; 3179 if (m >= 0 && m != s+i) 3180 return false; 3181 } 3182 return true; 3183} 3184 3185/// isVSHUFPSYMask - Return true if the specified VECTOR_SHUFFLE operand 3186/// specifies a shuffle of elements that is suitable for input to 256-bit 3187/// VSHUFPSY. 
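/// For example, the v8f32 mask <0, 3, 8, 9, 4, 7, 12, 13> is accepted; each element of the upper half repeats the corresponding lower-half selection, offset by one 128-bit lane.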
3188static bool isVSHUFPSYMask(const SmallVectorImpl<int> &Mask, EVT VT, 3189 const X86Subtarget *Subtarget) { 3190 int NumElems = VT.getVectorNumElements(); 3191 3192 if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) 3193 return false; 3194 3195 if (NumElems != 8) 3196 return false; 3197
3198 // VSHUFPSY divides the resulting vector into 4 chunks. 3199 // The sources are also split into 4 chunks, and each destination 3200 // chunk must come from a different source chunk. 3201 // 3202 // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0 3203 // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 3204 // 3205 // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4, 3206 // Y3..Y0, Y3..Y0, X3..X0, X3..X0 3207 //
3208 int QuarterSize = NumElems/4; 3209 int HalfSize = QuarterSize*2; 3210 for (int i = 0; i < QuarterSize; ++i) 3211 if (!isUndefOrInRange(Mask[i], 0, HalfSize)) 3212 return false; 3213 for (int i = QuarterSize; i < QuarterSize*2; ++i) 3214 if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize)) 3215 return false; 3216
3217 // The mask of the second half must be the same as the first but with 3218 // the appropriate offsets. This works in the same way as VPERMILPS 3219 // works with masks. 3220 for (int i = QuarterSize*2; i < QuarterSize*3; ++i) { 3221 if (!isUndefOrInRange(Mask[i], HalfSize, NumElems)) 3222 return false; 3223 int FstHalfIdx = i-HalfSize; 3224 if (Mask[FstHalfIdx] < 0) 3225 continue; 3226 if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize)) 3227 return false; 3228 } 3229 for (int i = QuarterSize*3; i < NumElems; ++i) { 3230 if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2)) 3231 return false; 3232 int FstHalfIdx = i-HalfSize; 3233 if (Mask[FstHalfIdx] < 0) 3234 continue; 3235 if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize)) 3236 return false; 3237 3238 } 3239 3240 return true; 3241} 3242
3243/// getShuffleVSHUFPSYImmediate - Return the appropriate immediate to shuffle 3244/// the specified VECTOR_SHUFFLE mask with the VSHUFPSY instruction. 3245static unsigned getShuffleVSHUFPSYImmediate(SDNode *N) { 3246 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3247 EVT VT = SVOp->getValueType(0); 3248 int NumElems = VT.getVectorNumElements(); 3249 3250 assert(NumElems == 8 && VT.getSizeInBits() == 256 && 3251 "Only supports v8i32 and v8f32 types"); 3252
3253 int HalfSize = NumElems/2; 3254 unsigned Mask = 0; 3255 for (int i = 0; i != NumElems ; ++i) { 3256 if (SVOp->getMaskElt(i) < 0) 3257 continue; 3258 // The mask of the first half must be equal to the second one. 3259 unsigned Shamt = (i%HalfSize)*2; 3260 unsigned Elt = SVOp->getMaskElt(i) % HalfSize; 3261 Mask |= Elt << Shamt; 3262 } 3263 3264 return Mask; 3265} 3266
3267/// isVSHUFPDYMask - Return true if the specified VECTOR_SHUFFLE operand 3268/// specifies a shuffle of elements that is suitable for input to 256-bit 3269/// VSHUFPDY. This shuffle doesn't have the same restriction as the PS 3270/// version and the mask of the second half isn't tied to the first 3271/// one. 3272static bool isVSHUFPDYMask(const SmallVectorImpl<int> &Mask, EVT VT, 3273 const X86Subtarget *Subtarget) { 3274 int NumElems = VT.getVectorNumElements(); 3275 3276 if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) 3277 return false; 3278 3279 if (NumElems != 4) 3280 return false; 3281
3282 // VSHUFPDY divides the resulting vector into 4 chunks. 3283 // The sources are also split into 4 chunks, and each destination 3284 // chunk must come from a different source chunk.
3285 // 3286 // SRC1 => X3 X2 X1 X0 3287 // SRC2 => Y3 Y2 Y1 Y0 3288 // 3289 // DST => Y2..Y3, X2..X3, Y1..Y0, X1..X0 3290 // 3291 int QuarterSize = NumElems/4; 3292 int HalfSize = QuarterSize*2; 3293 for (int i = 0; i < QuarterSize; ++i) 3294 if (!isUndefOrInRange(Mask[i], 0, HalfSize)) 3295 return false; 3296 for (int i = QuarterSize; i < QuarterSize*2; ++i) 3297 if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize)) 3298 return false; 3299 for (int i = QuarterSize*2; i < QuarterSize*3; ++i) 3300 if (!isUndefOrInRange(Mask[i], HalfSize, NumElems)) 3301 return false; 3302 for (int i = QuarterSize*3; i < NumElems; ++i) 3303 if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2)) 3304 return false; 3305 3306 return true; 3307} 3308 3309/// getShuffleVSHUFPDYImmediate - Return the appropriate immediate to shuffle 3310/// the specified VECTOR_MASK mask with VSHUFPDY instruction. 3311static unsigned getShuffleVSHUFPDYImmediate(SDNode *N) { 3312 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3313 EVT VT = SVOp->getValueType(0); 3314 int NumElems = VT.getVectorNumElements(); 3315 3316 assert(NumElems == 4 && VT.getSizeInBits() == 256 && 3317 "Only supports v4i64 and v4f64 types"); 3318 3319 int HalfSize = NumElems/2; 3320 unsigned Mask = 0; 3321 for (int i = 0; i != NumElems ; ++i) { 3322 if (SVOp->getMaskElt(i) < 0) 3323 continue; 3324 int Elt = SVOp->getMaskElt(i) % HalfSize; 3325 Mask |= Elt << i; 3326 } 3327 3328 return Mask; 3329} 3330 3331/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 3332/// specifies a shuffle of elements that is suitable for input to 128-bit 3333/// SHUFPS and SHUFPD. 3334static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3335 int NumElems = VT.getVectorNumElements(); 3336 3337 if (VT.getSizeInBits() != 128) 3338 return false; 3339 3340 if (NumElems != 2 && NumElems != 4) 3341 return false; 3342 3343 int Half = NumElems / 2; 3344 for (int i = 0; i < Half; ++i) 3345 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 3346 return false; 3347 for (int i = Half; i < NumElems; ++i) 3348 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 3349 return false; 3350 3351 return true; 3352} 3353 3354bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 3355 SmallVector<int, 8> M; 3356 N->getMask(M); 3357 return ::isSHUFPMask(M, N->getValueType(0)); 3358} 3359 3360/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 3361/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 3362/// half elements to come from vector 1 (which would equal the dest.) and 3363/// the upper half to come from vector 2. 3364static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3365 int NumElems = VT.getVectorNumElements(); 3366 3367 if (NumElems != 2 && NumElems != 4) 3368 return false; 3369 3370 int Half = NumElems / 2; 3371 for (int i = 0; i < Half; ++i) 3372 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 3373 return false; 3374 for (int i = Half; i < NumElems; ++i) 3375 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 3376 return false; 3377 return true; 3378} 3379 3380static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 3381 SmallVector<int, 8> M; 3382 N->getMask(M); 3383 return isCommutedSHUFPMask(M, N->getValueType(0)); 3384} 3385 3386/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 3387/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 
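/// The expected v4 mask is <6, 7, 2, 3>, with undef permitted in any position.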
3388bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 3389 EVT VT = N->getValueType(0); 3390 unsigned NumElems = VT.getVectorNumElements(); 3391 3392 if (VT.getSizeInBits() != 128) 3393 return false; 3394 3395 if (NumElems != 4) 3396 return false; 3397 3398 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 3399 return isUndefOrEqual(N->getMaskElt(0), 6) && 3400 isUndefOrEqual(N->getMaskElt(1), 7) && 3401 isUndefOrEqual(N->getMaskElt(2), 2) && 3402 isUndefOrEqual(N->getMaskElt(3), 3); 3403} 3404 3405/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 3406/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 3407/// <2, 3, 2, 3> 3408bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 3409 EVT VT = N->getValueType(0); 3410 unsigned NumElems = VT.getVectorNumElements(); 3411 3412 if (VT.getSizeInBits() != 128) 3413 return false; 3414 3415 if (NumElems != 4) 3416 return false; 3417 3418 return isUndefOrEqual(N->getMaskElt(0), 2) && 3419 isUndefOrEqual(N->getMaskElt(1), 3) && 3420 isUndefOrEqual(N->getMaskElt(2), 2) && 3421 isUndefOrEqual(N->getMaskElt(3), 3); 3422} 3423 3424/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3425/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 3426bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 3427 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3428 3429 if (NumElems != 2 && NumElems != 4) 3430 return false; 3431 3432 for (unsigned i = 0; i < NumElems/2; ++i) 3433 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 3434 return false; 3435 3436 for (unsigned i = NumElems/2; i < NumElems; ++i) 3437 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3438 return false; 3439 3440 return true; 3441} 3442 3443/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3444/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 3445bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 3446 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3447 3448 if ((NumElems != 2 && NumElems != 4) 3449 || N->getValueType(0).getSizeInBits() > 128) 3450 return false; 3451 3452 for (unsigned i = 0; i < NumElems/2; ++i) 3453 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3454 return false; 3455 3456 for (unsigned i = 0; i < NumElems/2; ++i) 3457 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 3458 return false; 3459 3460 return true; 3461} 3462 3463/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3464/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3465static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3466 bool V2IsSplat = false) { 3467 int NumElts = VT.getVectorNumElements(); 3468 3469 assert((VT.is128BitVector() || VT.is256BitVector()) && 3470 "Unsupported vector type for unpckh"); 3471 3472 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) 3473 return false; 3474 3475 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3476 // independently on 128-bit lanes. 
3477 unsigned NumLanes = VT.getSizeInBits()/128; 3478 unsigned NumLaneElts = NumElts/NumLanes; 3479 3480 unsigned Start = 0; 3481 unsigned End = NumLaneElts; 3482 for (unsigned s = 0; s < NumLanes; ++s) { 3483 for (unsigned i = Start, j = s * NumLaneElts; 3484 i != End; 3485 i += 2, ++j) { 3486 int BitI = Mask[i]; 3487 int BitI1 = Mask[i+1]; 3488 if (!isUndefOrEqual(BitI, j)) 3489 return false; 3490 if (V2IsSplat) { 3491 if (!isUndefOrEqual(BitI1, NumElts)) 3492 return false; 3493 } else { 3494 if (!isUndefOrEqual(BitI1, j + NumElts)) 3495 return false; 3496 } 3497 } 3498 // Process the next 128 bits. 3499 Start += NumLaneElts; 3500 End += NumLaneElts; 3501 } 3502 3503 return true; 3504} 3505 3506bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3507 SmallVector<int, 8> M; 3508 N->getMask(M); 3509 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3510} 3511 3512/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3513/// specifies a shuffle of elements that is suitable for input to UNPCKH. 3514static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 3515 bool V2IsSplat = false) { 3516 int NumElts = VT.getVectorNumElements(); 3517 3518 assert((VT.is128BitVector() || VT.is256BitVector()) && 3519 "Unsupported vector type for unpckh"); 3520 3521 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) 3522 return false; 3523 3524 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3525 // independently on 128-bit lanes. 3526 unsigned NumLanes = VT.getSizeInBits()/128; 3527 unsigned NumLaneElts = NumElts/NumLanes; 3528 3529 unsigned Start = 0; 3530 unsigned End = NumLaneElts; 3531 for (unsigned l = 0; l != NumLanes; ++l) { 3532 for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2; 3533 i != End; i += 2, ++j) { 3534 int BitI = Mask[i]; 3535 int BitI1 = Mask[i+1]; 3536 if (!isUndefOrEqual(BitI, j)) 3537 return false; 3538 if (V2IsSplat) { 3539 if (isUndefOrEqual(BitI1, NumElts)) 3540 return false; 3541 } else { 3542 if (!isUndefOrEqual(BitI1, j+NumElts)) 3543 return false; 3544 } 3545 } 3546 // Process the next 128 bits. 3547 Start += NumLaneElts; 3548 End += NumLaneElts; 3549 } 3550 return true; 3551} 3552 3553bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3554 SmallVector<int, 8> M; 3555 N->getMask(M); 3556 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 3557} 3558 3559/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3560/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 3561/// <0, 0, 1, 1> 3562static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3563 int NumElems = VT.getVectorNumElements(); 3564 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3565 return false; 3566 3567 // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern 3568 // FIXME: Need a better way to get rid of this, there's no latency difference 3569 // between UNPCKLPD and MOVDDUP, the later should always be checked first and 3570 // the former later. We should also remove the "_undef" special mask. 3571 if (NumElems == 4 && VT.getSizeInBits() == 256) 3572 return false; 3573 3574 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3575 // independently on 128-bit lanes. 
3576 unsigned NumLanes = VT.getSizeInBits() / 128; 3577 unsigned NumLaneElts = NumElems / NumLanes; 3578 3579 for (unsigned s = 0; s < NumLanes; ++s) { 3580 for (unsigned i = s * NumLaneElts, j = s * NumLaneElts; 3581 i != NumLaneElts * (s + 1); 3582 i += 2, ++j) { 3583 int BitI = Mask[i]; 3584 int BitI1 = Mask[i+1]; 3585 3586 if (!isUndefOrEqual(BitI, j)) 3587 return false; 3588 if (!isUndefOrEqual(BitI1, j)) 3589 return false; 3590 } 3591 } 3592 3593 return true; 3594} 3595
3596bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 3597 SmallVector<int, 8> M; 3598 N->getMask(M); 3599 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 3600} 3601
3602/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3603/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 3604/// <2, 2, 3, 3> 3605static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3606 int NumElems = VT.getVectorNumElements(); 3607 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3608 return false; 3609
3610 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 3611 int BitI = Mask[i]; 3612 int BitI1 = Mask[i+1]; 3613 if (!isUndefOrEqual(BitI, j)) 3614 return false; 3615 if (!isUndefOrEqual(BitI1, j)) 3616 return false; 3617 } 3618 return true; 3619} 3620
3621bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 3622 SmallVector<int, 8> M; 3623 N->getMask(M); 3624 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 3625} 3626
3627/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3628/// specifies a shuffle of elements that is suitable for input to MOVSS, 3629/// MOVSD, and MOVD, i.e. setting the lowest element. 3630static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3631 if (VT.getVectorElementType().getSizeInBits() < 32) 3632 return false; 3633 3634 int NumElts = VT.getVectorNumElements(); 3635 3636 if (!isUndefOrEqual(Mask[0], NumElts)) 3637 return false; 3638 3639 for (int i = 1; i < NumElts; ++i) 3640 if (!isUndefOrEqual(Mask[i], i)) 3641 return false; 3642 3643 return true; 3644} 3645
3646bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 3647 SmallVector<int, 8> M; 3648 N->getMask(M); 3649 return ::isMOVLMask(M, N->getValueType(0)); 3650} 3651
3652/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered 3653/// as permutations between 128-bit chunks or halves. As an example, the 3654/// shuffle below: 3655/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> 3656/// The first half comes from the second half of V1 and the second half comes 3657/// from the second half of V2. 3658static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT, 3659 const X86Subtarget *Subtarget) { 3660 if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) 3661 return false; 3662
3663 // The shuffle result is divided into half A and half B. In total the two 3664 // sources have 4 halves, namely: C, D, E, F. The final values of A and 3665 // B must come from C, D, E or F. 3666 int HalfSize = VT.getVectorNumElements()/2; 3667 bool MatchA = false, MatchB = false; 3668
3669 // Check if A comes from one of C, D, E, F. 3670 for (int Half = 0; Half < 4; ++Half) { 3671 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) { 3672 MatchA = true; 3673 break; 3674 } 3675 } 3676 3677 // Check if B comes from one of C, D, E, F.
3678 for (int Half = 0; Half < 4; ++Half) { 3679 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) { 3680 MatchB = true; 3681 break; 3682 } 3683 } 3684 3685 return MatchA && MatchB; 3686} 3687
3688/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle 3689/// the specified VECTOR_SHUFFLE mask with VPERM2F128 instructions. 3690static unsigned getShuffleVPERM2F128Immediate(SDNode *N) { 3691 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3692 EVT VT = SVOp->getValueType(0); 3693 3694 int HalfSize = VT.getVectorNumElements()/2; 3695 3696 int FstHalf = 0, SndHalf = 0; 3697 for (int i = 0; i < HalfSize; ++i) { 3698 if (SVOp->getMaskElt(i) > 0) { 3699 FstHalf = SVOp->getMaskElt(i)/HalfSize; 3700 break; 3701 } 3702 } 3703 for (int i = HalfSize; i < HalfSize*2; ++i) { 3704 if (SVOp->getMaskElt(i) > 0) { 3705 SndHalf = SVOp->getMaskElt(i)/HalfSize; 3706 break; 3707 } 3708 } 3709 3710 return (FstHalf | (SndHalf << 4)); 3711} 3712
3713/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand 3714/// specifies a shuffle of elements that is suitable for input to VPERMILPD*. 3715/// Note that VPERMIL mask matching is different depending on whether the 3716/// underlying type is 32 or 64 bits. In VPERMILPS the high half of the mask 3717/// should select the same elements as the low half, but from the higher half 3718/// of the source. In VPERMILPD the two lanes can be shuffled independently of 3719/// each other, with the same restriction that lanes can't be crossed. 3720static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT, 3721 const X86Subtarget *Subtarget) { 3722 int NumElts = VT.getVectorNumElements(); 3723 int NumLanes = VT.getSizeInBits()/128; 3724 3725 if (!Subtarget->hasAVX()) 3726 return false; 3727
3728 // Match any permutation of 128-bit vector with 64-bit types 3729 if (NumLanes == 1 && NumElts != 2) 3730 return false; 3731
3732 // Only match 256-bit with 64-bit element types 3733 if (VT.getSizeInBits() == 256 && NumElts != 4) 3734 return false; 3735
3736 // The mask on the high lane is independent of the low. Both can match 3737 // any element inside its own lane, but can't cross. 3738 int LaneSize = NumElts/NumLanes; 3739 for (int l = 0; l < NumLanes; ++l) 3740 for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { 3741 int LaneStart = l*LaneSize; 3742 if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize)) 3743 return false; 3744 } 3745 3746 return true; 3747} 3748
3749/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand 3750/// specifies a shuffle of elements that is suitable for input to VPERMILPS*. 3751/// Note that VPERMIL mask matching is different depending on whether the 3752/// underlying type is 32 or 64 bits. In VPERMILPS the high half of the mask 3753/// should select the same elements as the low half, but from the higher half 3754/// of the source. In VPERMILPD the two lanes can be shuffled independently of 3755/// each other, with the same restriction that lanes can't be crossed.
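/// For example, for v8f32 the mask <1, 0, 3, 2, 5, 4, 7, 6> is accepted: each high-lane index equals the matching low-lane index plus 4.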
3756static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT, 3757 const X86Subtarget *Subtarget) { 3758 unsigned NumElts = VT.getVectorNumElements(); 3759 unsigned NumLanes = VT.getSizeInBits()/128; 3760 3761 if (!Subtarget->hasAVX()) 3762 return false; 3763
3764 // Match any permutation of 128-bit vector with 32-bit types 3765 if (NumLanes == 1 && NumElts != 4) 3766 return false; 3767
3768 // Only match 256-bit with 32-bit element types 3769 if (VT.getSizeInBits() == 256 && NumElts != 8) 3770 return false; 3771
3772 // The mask on the high lane should be the same as the low. Actually, 3773 // they can differ if either of the corresponding indices in a lane is undef 3774 // and the other stays in range. 3775 int LaneSize = NumElts/NumLanes; 3776 for (int i = 0; i < LaneSize; ++i) { 3777 int HighElt = i+LaneSize; 3778 bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts); 3779 bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize); 3780 3781 if (!HighValid || !LowValid) 3782 return false; 3783 if (Mask[i] < 0 || Mask[HighElt] < 0) 3784 continue; 3785 if (Mask[HighElt]-Mask[i] != LaneSize) 3786 return false; 3787 } 3788 3789 return true; 3790} 3791
3792/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle 3793/// the specified VECTOR_SHUFFLE mask with VPERMILPS* instructions. 3794static unsigned getShuffleVPERMILPSImmediate(SDNode *N) { 3795 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3796 EVT VT = SVOp->getValueType(0); 3797 3798 int NumElts = VT.getVectorNumElements(); 3799 int NumLanes = VT.getSizeInBits()/128; 3800 int LaneSize = NumElts/NumLanes; 3801
3802 // Although the mask is equal for both lanes, do it twice to handle the cases 3803 // where a mask element is undef on the first half but valid on the second. 3804 // This catches pathological cases such as 3805 // shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid. 3806 unsigned Mask = 0; 3807 for (int l = 0; l < NumLanes; ++l) { 3808 for (int i = 0; i < LaneSize; ++i) { 3809 int MaskElt = SVOp->getMaskElt(i+(l*LaneSize)); 3810 if (MaskElt < 0) 3811 continue; 3812 if (MaskElt >= LaneSize) 3813 MaskElt -= LaneSize; 3814 Mask |= MaskElt << (i*2); 3815 } 3816 } 3817 3818 return Mask; 3819} 3820
3821/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle 3822/// the specified VECTOR_SHUFFLE mask with VPERMILPD* instructions. 3823static unsigned getShuffleVPERMILPDImmediate(SDNode *N) { 3824 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3825 EVT VT = SVOp->getValueType(0); 3826 3827 int NumElts = VT.getVectorNumElements(); 3828 int NumLanes = VT.getSizeInBits()/128; 3829 3830 unsigned Mask = 0; 3831 int LaneSize = NumElts/NumLanes; 3832 for (int l = 0; l < NumLanes; ++l) 3833 for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { 3834 int MaskElt = SVOp->getMaskElt(i); 3835 if (MaskElt < 0) 3836 continue; 3837 Mask |= (MaskElt-l*LaneSize) << i; 3838 } 3839 3840 return Mask; 3841} 3842
3843/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse 3844/// of what x86 movss wants. X86 movss requires the lowest element to be the lowest 3845/// element of vector 2 and the other elements to come from vector 1 in order.
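/// For example, the v4 mask <0, 5, 6, 7> is the commuted form: element 0 of the first vector followed by elements 1..3 of the second.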
3846static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3847 bool V2IsSplat = false, bool V2IsUndef = false) { 3848 int NumOps = VT.getVectorNumElements(); 3849 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3850 return false; 3851 3852 if (!isUndefOrEqual(Mask[0], 0)) 3853 return false; 3854 3855 for (int i = 1; i < NumOps; ++i) 3856 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3857 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3858 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3859 return false; 3860 3861 return true; 3862} 3863 3864static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3865 bool V2IsUndef = false) { 3866 SmallVector<int, 8> M; 3867 N->getMask(M); 3868 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3869} 3870 3871/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3872/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3873/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> 3874bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, 3875 const X86Subtarget *Subtarget) { 3876 if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) 3877 return false; 3878 3879 // The second vector must be undef 3880 if (N->getOperand(1).getOpcode() != ISD::UNDEF) 3881 return false; 3882 3883 EVT VT = N->getValueType(0); 3884 unsigned NumElems = VT.getVectorNumElements(); 3885 3886 if ((VT.getSizeInBits() == 128 && NumElems != 4) || 3887 (VT.getSizeInBits() == 256 && NumElems != 8)) 3888 return false; 3889 3890 // "i+1" is the value the indexed mask element must have 3891 for (unsigned i = 0; i < NumElems; i += 2) 3892 if (!isUndefOrEqual(N->getMaskElt(i), i+1) || 3893 !isUndefOrEqual(N->getMaskElt(i+1), i+1)) 3894 return false; 3895 3896 return true; 3897} 3898 3899/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3900/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3901/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> 3902bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, 3903 const X86Subtarget *Subtarget) { 3904 if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) 3905 return false; 3906 3907 // The second vector must be undef 3908 if (N->getOperand(1).getOpcode() != ISD::UNDEF) 3909 return false; 3910 3911 EVT VT = N->getValueType(0); 3912 unsigned NumElems = VT.getVectorNumElements(); 3913 3914 if ((VT.getSizeInBits() == 128 && NumElems != 4) || 3915 (VT.getSizeInBits() == 256 && NumElems != 8)) 3916 return false; 3917 3918 // "i" is the value the indexed mask element must have 3919 for (unsigned i = 0; i < NumElems; i += 2) 3920 if (!isUndefOrEqual(N->getMaskElt(i), i) || 3921 !isUndefOrEqual(N->getMaskElt(i+1), i)) 3922 return false; 3923 3924 return true; 3925} 3926 3927/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand 3928/// specifies a shuffle of elements that is suitable for input to 256-bit 3929/// version of MOVDDUP. 
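/// The expected v4f64 mask is <0, 0, 2, 2>: each 128-bit lane duplicates its even element (undef elements are also accepted).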
3930static bool isMOVDDUPYMask(ShuffleVectorSDNode *N, 3931 const X86Subtarget *Subtarget) { 3932 EVT VT = N->getValueType(0); 3933 int NumElts = VT.getVectorNumElements(); 3934 bool V2IsUndef = N->getOperand(1).getOpcode() == ISD::UNDEF; 3935 3936 if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256 || 3937 !V2IsUndef || NumElts != 4) 3938 return false; 3939 3940 for (int i = 0; i != NumElts/2; ++i) 3941 if (!isUndefOrEqual(N->getMaskElt(i), 0)) 3942 return false; 3943 for (int i = NumElts/2; i != NumElts; ++i) 3944 if (!isUndefOrEqual(N->getMaskElt(i), NumElts/2)) 3945 return false; 3946 return true; 3947} 3948 3949/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3950/// specifies a shuffle of elements that is suitable for input to 128-bit 3951/// version of MOVDDUP. 3952bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3953 EVT VT = N->getValueType(0); 3954 3955 if (VT.getSizeInBits() != 128) 3956 return false; 3957 3958 int e = VT.getVectorNumElements() / 2; 3959 for (int i = 0; i < e; ++i) 3960 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3961 return false; 3962 for (int i = 0; i < e; ++i) 3963 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3964 return false; 3965 return true; 3966} 3967 3968/// isVEXTRACTF128Index - Return true if the specified 3969/// EXTRACT_SUBVECTOR operand specifies a vector extract that is 3970/// suitable for input to VEXTRACTF128. 3971bool X86::isVEXTRACTF128Index(SDNode *N) { 3972 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3973 return false; 3974 3975 // The index should be aligned on a 128-bit boundary. 3976 uint64_t Index = 3977 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3978 3979 unsigned VL = N->getValueType(0).getVectorNumElements(); 3980 unsigned VBits = N->getValueType(0).getSizeInBits(); 3981 unsigned ElSize = VBits / VL; 3982 bool Result = (Index * ElSize) % 128 == 0; 3983 3984 return Result; 3985} 3986 3987/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR 3988/// operand specifies a subvector insert that is suitable for input to 3989/// VINSERTF128. 3990bool X86::isVINSERTF128Index(SDNode *N) { 3991 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3992 return false; 3993 3994 // The index should be aligned on a 128-bit boundary. 3995 uint64_t Index = 3996 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3997 3998 unsigned VL = N->getValueType(0).getVectorNumElements(); 3999 unsigned VBits = N->getValueType(0).getSizeInBits(); 4000 unsigned ElSize = VBits / VL; 4001 bool Result = (Index * ElSize) % 128 == 0; 4002 4003 return Result; 4004} 4005 4006/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 4007/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 4008unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 4009 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 4010 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 4011 4012 unsigned Shift = (NumOperands == 4) ? 2 : 1; 4013 unsigned Mask = 0; 4014 for (int i = 0; i < NumOperands; ++i) { 4015 int Val = SVOp->getMaskElt(NumOperands-i-1); 4016 if (Val < 0) Val = 0; 4017 if (Val >= NumOperands) Val -= NumOperands; 4018 Mask |= Val; 4019 if (i != NumOperands - 1) 4020 Mask <<= Shift; 4021 } 4022 return Mask; 4023} 4024 4025/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 4026/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 
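/// For example, the v8i16 mask <0, 1, 2, 3, 7, 6, 5, 4> yields the immediate 0x1B: element 4's selection (7, encoded as 7-4 = 3) lands in bits 1:0, element 5's in bits 3:2, and so on.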
4027unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 4028 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 4029 unsigned Mask = 0; 4030 // 8 nodes, but we only care about the last 4. 4031 for (unsigned i = 7; i >= 4; --i) { 4032 int Val = SVOp->getMaskElt(i); 4033 if (Val >= 0) 4034 Mask |= (Val - 4); 4035 if (i != 4) 4036 Mask <<= 2; 4037 } 4038 return Mask; 4039} 4040 4041/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 4042/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 4043unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 4044 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 4045 unsigned Mask = 0; 4046 // 8 nodes, but we only care about the first 4. 4047 for (int i = 3; i >= 0; --i) { 4048 int Val = SVOp->getMaskElt(i); 4049 if (Val >= 0) 4050 Mask |= Val; 4051 if (i != 0) 4052 Mask <<= 2; 4053 } 4054 return Mask; 4055} 4056 4057/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 4058/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 4059unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 4060 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 4061 EVT VVT = N->getValueType(0); 4062 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 4063 int Val = 0; 4064 4065 unsigned i, e; 4066 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 4067 Val = SVOp->getMaskElt(i); 4068 if (Val >= 0) 4069 break; 4070 } 4071 assert(Val - i > 0 && "PALIGNR imm should be positive"); 4072 return (Val - i) * EltSize; 4073} 4074 4075/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate 4076/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 4077/// instructions. 4078unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { 4079 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 4080 llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); 4081 4082 uint64_t Index = 4083 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 4084 4085 EVT VecVT = N->getOperand(0).getValueType(); 4086 EVT ElVT = VecVT.getVectorElementType(); 4087 4088 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 4089 return Index / NumElemsPerChunk; 4090} 4091 4092/// getInsertVINSERTF128Immediate - Return the appropriate immediate 4093/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 4094/// instructions. 4095unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { 4096 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 4097 llvm_unreachable("Illegal insert subvector for VINSERTF128"); 4098 4099 uint64_t Index = 4100 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 4101 4102 EVT VecVT = N->getValueType(0); 4103 EVT ElVT = VecVT.getVectorElementType(); 4104 4105 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 4106 return Index / NumElemsPerChunk; 4107} 4108 4109/// isZeroNode - Returns true if Elt is a constant zero or a floating point 4110/// constant +0.0. 4111bool X86::isZeroNode(SDValue Elt) { 4112 return ((isa<ConstantSDNode>(Elt) && 4113 cast<ConstantSDNode>(Elt)->isNullValue()) || 4114 (isa<ConstantFPSDNode>(Elt) && 4115 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 4116} 4117 4118/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 4119/// their permute mask. 
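/// For example, commuting shuffle(V1, V2, <0, 5, 2, 7>) yields shuffle(V2, V1, <4, 1, 6, 3>).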
4120static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 4121 SelectionDAG &DAG) { 4122 EVT VT = SVOp->getValueType(0); 4123 unsigned NumElems = VT.getVectorNumElements(); 4124 SmallVector<int, 8> MaskVec; 4125 4126 for (unsigned i = 0; i != NumElems; ++i) { 4127 int idx = SVOp->getMaskElt(i); 4128 if (idx < 0) 4129 MaskVec.push_back(idx); 4130 else if (idx < (int)NumElems) 4131 MaskVec.push_back(idx + NumElems); 4132 else 4133 MaskVec.push_back(idx - NumElems); 4134 } 4135 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 4136 SVOp->getOperand(0), &MaskVec[0]); 4137} 4138 4139/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 4140/// the two vector operands have swapped position. 4141static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 4142 unsigned NumElems = VT.getVectorNumElements(); 4143 for (unsigned i = 0; i != NumElems; ++i) { 4144 int idx = Mask[i]; 4145 if (idx < 0) 4146 continue; 4147 else if (idx < (int)NumElems) 4148 Mask[i] = idx + NumElems; 4149 else 4150 Mask[i] = idx - NumElems; 4151 } 4152} 4153 4154/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 4155/// match movhlps. The lower half elements should come from upper half of 4156/// V1 (and in order), and the upper half elements should come from the upper 4157/// half of V2 (and in order). 4158static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 4159 EVT VT = Op->getValueType(0); 4160 if (VT.getSizeInBits() != 128) 4161 return false; 4162 if (VT.getVectorNumElements() != 4) 4163 return false; 4164 for (unsigned i = 0, e = 2; i != e; ++i) 4165 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 4166 return false; 4167 for (unsigned i = 2; i != 4; ++i) 4168 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 4169 return false; 4170 return true; 4171} 4172 4173/// isScalarLoadToVector - Returns true if the node is a scalar load that 4174/// is promoted to a vector. It also returns the LoadSDNode by reference if 4175/// required. 4176static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 4177 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 4178 return false; 4179 N = N->getOperand(0).getNode(); 4180 if (!ISD::isNON_EXTLoad(N)) 4181 return false; 4182 if (LD) 4183 *LD = cast<LoadSDNode>(N); 4184 return true; 4185} 4186 4187/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 4188/// match movlp{s|d}. The lower half elements should come from lower half of 4189/// V1 (and in order), and the upper half elements should come from the upper 4190/// half of V2 (and in order). And since V1 will become the source of the 4191/// MOVLP, it must be either a vector load or a scalar load to vector. 4192static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 4193 ShuffleVectorSDNode *Op) { 4194 EVT VT = Op->getValueType(0); 4195 if (VT.getSizeInBits() != 128) 4196 return false; 4197 4198 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 4199 return false; 4200 // Is V2 is a vector load, don't do this transformation. We will try to use 4201 // load folding shufps op. 
4202 if (ISD::isNON_EXTLoad(V2)) 4203 return false; 4204 4205 unsigned NumElems = VT.getVectorNumElements(); 4206 4207 if (NumElems != 2 && NumElems != 4) 4208 return false; 4209 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 4210 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 4211 return false; 4212 for (unsigned i = NumElems/2; i != NumElems; ++i) 4213 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 4214 return false; 4215 return true; 4216} 4217 4218/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 4219/// all the same. 4220static bool isSplatVector(SDNode *N) { 4221 if (N->getOpcode() != ISD::BUILD_VECTOR) 4222 return false; 4223 4224 SDValue SplatValue = N->getOperand(0); 4225 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 4226 if (N->getOperand(i) != SplatValue) 4227 return false; 4228 return true; 4229} 4230 4231/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 4232/// to an zero vector. 4233/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 4234static bool isZeroShuffle(ShuffleVectorSDNode *N) { 4235 SDValue V1 = N->getOperand(0); 4236 SDValue V2 = N->getOperand(1); 4237 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 4238 for (unsigned i = 0; i != NumElems; ++i) { 4239 int Idx = N->getMaskElt(i); 4240 if (Idx >= (int)NumElems) { 4241 unsigned Opc = V2.getOpcode(); 4242 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 4243 continue; 4244 if (Opc != ISD::BUILD_VECTOR || 4245 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 4246 return false; 4247 } else if (Idx >= 0) { 4248 unsigned Opc = V1.getOpcode(); 4249 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 4250 continue; 4251 if (Opc != ISD::BUILD_VECTOR || 4252 !X86::isZeroNode(V1.getOperand(Idx))) 4253 return false; 4254 } 4255 } 4256 return true; 4257} 4258 4259/// getZeroVector - Returns a vector of specified type with all zero elements. 4260/// 4261static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 4262 DebugLoc dl) { 4263 assert(VT.isVector() && "Expected a vector type"); 4264 4265 // Always build SSE zero vectors as <4 x i32> bitcasted 4266 // to their dest type. This ensures they get CSE'd. 4267 SDValue Vec; 4268 if (VT.getSizeInBits() == 128) { // SSE 4269 if (HasSSE2) { // SSE2 4270 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 4271 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4272 } else { // SSE1 4273 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 4274 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 4275 } 4276 } else if (VT.getSizeInBits() == 256) { // AVX 4277 // 256-bit logic and arithmetic instructions in AVX are 4278 // all floating-point, no support for integer ops. Default 4279 // to emitting fp zeroed vectors then. 4280 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 4281 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4282 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 4283 } 4284 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 4285} 4286 4287/// getOnesVector - Returns a vector of specified type with all bits set. 4288/// Always build ones vectors as <4 x i32>. For 256-bit types, use two 4289/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their 4290/// original type, ensuring they get CSE'd. 
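/// For example, an all-ones v4f32 is emitted as a bitcast of the v4i32 build_vector <-1, -1, -1, -1>, so identical ones-constants share a single node across types.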
4291static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 4292 assert(VT.isVector() && "Expected a vector type"); 4293 assert((VT.is128BitVector() || VT.is256BitVector()) 4294 && "Expected a 128-bit or 256-bit vector type"); 4295 4296 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 4297 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, 4298 Cst, Cst, Cst, Cst); 4299
4300 if (VT.is256BitVector()) { 4301 SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), 4302 Vec, DAG.getConstant(0, MVT::i32), DAG, dl); 4303 Vec = Insert128BitVector(InsV, Vec, 4304 DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); 4305 } 4306 4307 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 4308} 4309
4310/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 4311/// that point to V2 point to its first element. 4312static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4313 EVT VT = SVOp->getValueType(0); 4314 unsigned NumElems = VT.getVectorNumElements(); 4315 4316 bool Changed = false; 4317 SmallVector<int, 8> MaskVec; 4318 SVOp->getMask(MaskVec); 4319
4320 for (unsigned i = 0; i != NumElems; ++i) { 4321 if (MaskVec[i] > (int)NumElems) { 4322 MaskVec[i] = NumElems; 4323 Changed = true; 4324 } 4325 } 4326 if (Changed) 4327 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 4328 SVOp->getOperand(1), &MaskVec[0]); 4329 return SDValue(SVOp, 0); 4330} 4331
4332/// getMOVL - Returns a vector_shuffle mask for a movs{s|d}, movd 4333/// operation of specified width. 4334static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4335 SDValue V2) { 4336 unsigned NumElems = VT.getVectorNumElements(); 4337 SmallVector<int, 8> Mask; 4338 Mask.push_back(NumElems); 4339 for (unsigned i = 1; i != NumElems; ++i) 4340 Mask.push_back(i); 4341 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4342} 4343
4344/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 4345static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4346 SDValue V2) { 4347 unsigned NumElems = VT.getVectorNumElements(); 4348 SmallVector<int, 8> Mask; 4349 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 4350 Mask.push_back(i); 4351 Mask.push_back(i + NumElems); 4352 } 4353 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4354} 4355
4356/// getUnpackh - Returns a vector_shuffle node for an unpackh operation. 4357static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4358 SDValue V2) { 4359 unsigned NumElems = VT.getVectorNumElements(); 4360 unsigned Half = NumElems/2; 4361 SmallVector<int, 8> Mask; 4362 for (unsigned i = 0; i != Half; ++i) { 4363 Mask.push_back(i + Half); 4364 Mask.push_back(i + NumElems + Half); 4365 } 4366 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4367} 4368
4369// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by 4370// a generic shuffle instruction because the target has no such instructions. 4371// Generate shuffles which repeat i16 and i8 several times until they can be 4372// represented by v4f32 and then be manipulated by target supported shuffles.
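// For example, splatting element 5 of a v8i16 takes a single unpckh of the vector with itself: the value then also occupies 32-bit lane 1, so the later v4f32 splat of that lane replicates it across the whole register.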
4373static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { 4374 EVT VT = V.getValueType(); 4375 int NumElems = VT.getVectorNumElements(); 4376 DebugLoc dl = V.getDebugLoc(); 4377 4378 while (NumElems > 4) { 4379 if (EltNo < NumElems/2) { 4380 V = getUnpackl(DAG, dl, VT, V, V); 4381 } else { 4382 V = getUnpackh(DAG, dl, VT, V, V); 4383 EltNo -= NumElems/2; 4384 } 4385 NumElems >>= 1; 4386 } 4387 return V; 4388} 4389
4390/// getLegalSplat - Generate a legal splat with supported x86 shuffles 4391static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { 4392 EVT VT = V.getValueType(); 4393 DebugLoc dl = V.getDebugLoc(); 4394 assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) 4395 && "Vector size not supported"); 4396
4397 if (VT.getSizeInBits() == 128) { 4398 V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); 4399 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 4400 V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32), 4401 &SplatMask[0]); 4402 } else { 4403 // To use VPERMILPS to splat scalars, the second half of indices must 4404 // refer to the higher part, which is a duplication of the lower one, 4405 // because VPERMILPS can only handle in-lane permutations. 4406 int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo, 4407 EltNo+4, EltNo+4, EltNo+4, EltNo+4 }; 4408 4409 V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V); 4410 V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32), 4411 &SplatMask[0]); 4412 } 4413 4414 return DAG.getNode(ISD::BITCAST, dl, VT, V); 4415} 4416
4417/// PromoteSplat - Splat is promoted to target supported vector shuffles. 4418static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 4419 EVT SrcVT = SV->getValueType(0); 4420 SDValue V1 = SV->getOperand(0); 4421 DebugLoc dl = SV->getDebugLoc(); 4422 4423 int EltNo = SV->getSplatIndex(); 4424 int NumElems = SrcVT.getVectorNumElements(); 4425 unsigned Size = SrcVT.getSizeInBits(); 4426 4427 assert(((Size == 128 && NumElems > 4) || Size == 256) && 4428 "Unknown how to promote splat for type"); 4429
4430 // Extract the 128-bit part containing the splat element and update 4431 // the splat element index when it refers to the higher register. 4432 if (Size == 256) { 4433 unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0; 4434 V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl); 4435 if (Idx > 0) 4436 EltNo -= NumElems/2; 4437 } 4438
4439 // All i16 and i8 vector types can't be used directly by a generic shuffle 4440 // instruction because the target has no such instruction. Generate shuffles 4441 // which repeat i16 and i8 several times until they fit in i32, and then can 4442 // be manipulated by target supported shuffles. 4443 EVT EltVT = SrcVT.getVectorElementType(); 4444 if (EltVT == MVT::i8 || EltVT == MVT::i16) 4445 V1 = PromoteSplati8i16(V1, DAG, EltNo); 4446
4447 // Recreate the 256-bit vector and place the same 128-bit vector 4448 // into the low and high part. This is necessary because we want 4449 // to use VPERM* to shuffle the vectors. 4450 if (Size == 256) { 4451 SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1, 4452 DAG.getConstant(0, MVT::i32), DAG, dl); 4453 V1 = Insert128BitVector(InsV, V1, 4454 DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); 4455 } 4456 4457 return getLegalSplat(DAG, V1, EltNo); 4458} 4459
4460/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 4461/// vector and a zero or undef vector.
This produces a shuffle where the low 4462/// element of V2 is swizzled into the zero/undef vector, landing at element 4463/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 4464static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 4465 bool isZero, bool HasSSE2, 4466 SelectionDAG &DAG) { 4467 EVT VT = V2.getValueType(); 4468 SDValue V1 = isZero 4469 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 4470 unsigned NumElems = VT.getVectorNumElements(); 4471 SmallVector<int, 16> MaskVec; 4472 for (unsigned i = 0; i != NumElems; ++i) 4473 // If this is the insertion idx, put the low elt of V2 here. 4474 MaskVec.push_back(i == Idx ? NumElems : i); 4475 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 4476} 4477 4478/// getShuffleScalarElt - Returns the scalar element that will make up the ith 4479/// element of the result of the vector shuffle. 4480static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, 4481 unsigned Depth) { 4482 if (Depth == 6) 4483 return SDValue(); // Limit search depth. 4484 4485 SDValue V = SDValue(N, 0); 4486 EVT VT = V.getValueType(); 4487 unsigned Opcode = V.getOpcode(); 4488 4489 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 4490 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 4491 Index = SV->getMaskElt(Index); 4492 4493 if (Index < 0) 4494 return DAG.getUNDEF(VT.getVectorElementType()); 4495 4496 int NumElems = VT.getVectorNumElements(); 4497 SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); 4498 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); 4499 } 4500 4501 // Recurse into target specific vector shuffles to find scalars. 4502 if (isTargetShuffle(Opcode)) { 4503 int NumElems = VT.getVectorNumElements(); 4504 SmallVector<unsigned, 16> ShuffleMask; 4505 SDValue ImmN; 4506 4507 switch(Opcode) { 4508 case X86ISD::SHUFPS: 4509 case X86ISD::SHUFPD: 4510 ImmN = N->getOperand(N->getNumOperands()-1); 4511 DecodeSHUFPSMask(NumElems, 4512 cast<ConstantSDNode>(ImmN)->getZExtValue(), 4513 ShuffleMask); 4514 break; 4515 case X86ISD::PUNPCKHBW: 4516 case X86ISD::PUNPCKHWD: 4517 case X86ISD::PUNPCKHDQ: 4518 case X86ISD::PUNPCKHQDQ: 4519 DecodePUNPCKHMask(NumElems, ShuffleMask); 4520 break; 4521 case X86ISD::UNPCKHPS: 4522 case X86ISD::UNPCKHPD: 4523 case X86ISD::VUNPCKHPSY: 4524 case X86ISD::VUNPCKHPDY: 4525 DecodeUNPCKHPMask(NumElems, ShuffleMask); 4526 break; 4527 case X86ISD::PUNPCKLBW: 4528 case X86ISD::PUNPCKLWD: 4529 case X86ISD::PUNPCKLDQ: 4530 case X86ISD::PUNPCKLQDQ: 4531 DecodePUNPCKLMask(VT, ShuffleMask); 4532 break; 4533 case X86ISD::UNPCKLPS: 4534 case X86ISD::UNPCKLPD: 4535 case X86ISD::VUNPCKLPSY: 4536 case X86ISD::VUNPCKLPDY: 4537 DecodeUNPCKLPMask(VT, ShuffleMask); 4538 break; 4539 case X86ISD::MOVHLPS: 4540 DecodeMOVHLPSMask(NumElems, ShuffleMask); 4541 break; 4542 case X86ISD::MOVLHPS: 4543 DecodeMOVLHPSMask(NumElems, ShuffleMask); 4544 break; 4545 case X86ISD::PSHUFD: 4546 ImmN = N->getOperand(N->getNumOperands()-1); 4547 DecodePSHUFMask(NumElems, 4548 cast<ConstantSDNode>(ImmN)->getZExtValue(), 4549 ShuffleMask); 4550 break; 4551 case X86ISD::PSHUFHW: 4552 ImmN = N->getOperand(N->getNumOperands()-1); 4553 DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 4554 ShuffleMask); 4555 break; 4556 case X86ISD::PSHUFLW: 4557 ImmN = N->getOperand(N->getNumOperands()-1); 4558 DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 4559 ShuffleMask); 4560 
break; 4561 case X86ISD::MOVSS: 4562 case X86ISD::MOVSD: { 4563 // The index 0 always comes from the first element of the second source; 4564 // this is why MOVSS and MOVSD are used in the first place. The other 4565 // elements come from the other positions of the first source vector. 4566 unsigned OpNum = (Index == 0) ? 1 : 0; 4567 return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG, 4568 Depth+1); 4569 } 4570 case X86ISD::VPERMILPS: 4571 ImmN = N->getOperand(N->getNumOperands()-1); 4572 DecodeVPERMILPSMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), 4573 ShuffleMask); 4574 break; 4575 case X86ISD::VPERMILPSY: 4576 ImmN = N->getOperand(N->getNumOperands()-1); 4577 DecodeVPERMILPSMask(8, cast<ConstantSDNode>(ImmN)->getZExtValue(), 4578 ShuffleMask); 4579 break; 4580 case X86ISD::VPERMILPD: 4581 ImmN = N->getOperand(N->getNumOperands()-1); 4582 DecodeVPERMILPDMask(2, cast<ConstantSDNode>(ImmN)->getZExtValue(), 4583 ShuffleMask); 4584 break; 4585 case X86ISD::VPERMILPDY: 4586 ImmN = N->getOperand(N->getNumOperands()-1); 4587 DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), 4588 ShuffleMask); 4589 break; 4590 case X86ISD::VPERM2F128: 4591 ImmN = N->getOperand(N->getNumOperands()-1); 4592 DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), 4593 ShuffleMask); 4594 break; 4595 default: 4596 assert(0 && "not implemented for target shuffle node"); 4597 return SDValue(); 4598 } 4599 4600 Index = ShuffleMask[Index]; 4601 if (Index < 0) 4602 return DAG.getUNDEF(VT.getVectorElementType()); 4603 4604 SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1); 4605 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, 4606 Depth+1); 4607 } 4608 4609 // Actual nodes that may contain scalar elements 4610 if (Opcode == ISD::BITCAST) { 4611 V = V.getOperand(0); 4612 EVT SrcVT = V.getValueType(); 4613 unsigned NumElems = VT.getVectorNumElements(); 4614 4615 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) 4616 return SDValue(); 4617 } 4618 4619 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) 4620 return (Index == 0) ? V.getOperand(0) 4621 : DAG.getUNDEF(VT.getVectorElementType()); 4622 4623 if (V.getOpcode() == ISD::BUILD_VECTOR) 4624 return V.getOperand(Index); 4625 4626 return SDValue(); 4627} 4628 4629/// getNumOfConsecutiveZeros - Return the number of elements of a vector 4630/// shuffle operation which come consecutively from a zero. The 4631/// search can start in two different directions, from left or right. 4632static 4633unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems, 4634 bool ZerosFromLeft, SelectionDAG &DAG) { 4635 int i = 0; 4636 4637 while (i < NumElems) { 4638 unsigned Index = ZerosFromLeft ? i : NumElems-i-1; 4639 SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0); 4640 if (!(Elt.getNode() && 4641 (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)))) 4642 break; 4643 ++i; 4644 } 4645 4646 return i; 4647} 4648 4649/// isShuffleMaskConsecutive - Check if the shuffle mask indices from MaskI to 4650/// MaskE correspond consecutively to elements from one of the vector operands, 4651/// starting from its index OpIdx. Also set OpNum to indicate which source vector operand the elements come from.
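/// For example, with the mask <1, 2, 3, u> and NumElems == 4, a query with
/// MaskI == 0, MaskE == 2 and OpIdx == 1 returns true and sets OpNum to 0,
/// since mask positions 0 through 2 pick elements 1, 2 and 3 of the first
/// operand consecutively.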
4652static 4653bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE, 4654 int OpIdx, int NumElems, unsigned &OpNum) { 4655 bool SeenV1 = false; 4656 bool SeenV2 = false; 4657 4658 for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) { 4659 int Idx = SVOp->getMaskElt(i); 4660 // Ignore undef indices 4661 if (Idx < 0) 4662 continue; 4663 4664 if (Idx < NumElems) 4665 SeenV1 = true; 4666 else 4667 SeenV2 = true; 4668 4669 // Only accept consecutive elements from the same vector 4670 if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2)) 4671 return false; 4672 } 4673 4674 OpNum = SeenV1 ? 0 : 1; 4675 return true; 4676} 4677 4678/// isVectorShiftRight - Returns true if the shuffle can be implemented as a 4679/// logical right shift of a vector. 4680static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4681 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4682 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 4683 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 4684 false /* check zeros from right */, DAG); 4685 unsigned OpSrc; 4686 4687 if (!NumZeros) 4688 return false; 4689 4690 // Considering the elements in the mask that are not consecutive zeros, 4691 // check if they consecutively come from only one of the source vectors. 4692 // 4693 // V1 = {X, A, B, C} 0 4694 // \ \ \ / 4695 // vector_shuffle V1, V2 <1, 2, 3, X> 4696 // 4697 if (!isShuffleMaskConsecutive(SVOp, 4698 0, // Mask Start Index 4699 NumElems-NumZeros-1, // Mask End Index 4700 NumZeros, // Where to start looking in the src vector 4701 NumElems, // Number of elements in vector 4702 OpSrc)) // Which source operand? 4703 return false; 4704 4705 isLeft = false; 4706 ShAmt = NumZeros; 4707 ShVal = SVOp->getOperand(OpSrc); 4708 return true; 4709} 4710 4711/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a 4712/// logical left shift of a vector. 4713static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4714 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4715 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 4716 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 4717 true /* check zeros from left */, DAG); 4718 unsigned OpSrc; 4719 4720 if (!NumZeros) 4721 return false; 4722 4723 // Considering the elements in the mask that are not consecutive zeros, 4724 // check if they consecutively come from only one of the source vectors. 4725 // 4726 // 0 { A, B, X, X } = V2 4727 // / \ / / 4728 // vector_shuffle V1, V2 <X, X, 4, 5> 4729 // 4730 if (!isShuffleMaskConsecutive(SVOp, 4731 NumZeros, // Mask Start Index 4732 NumElems-1, // Mask End Index 4733 0, // Where to start looking in the src vector 4734 NumElems, // Number of elements in vector 4735 OpSrc)) // Which source operand? 4736 return false; 4737 4738 isLeft = true; 4739 ShAmt = NumZeros; 4740 ShVal = SVOp->getOperand(OpSrc); 4741 return true; 4742} 4743 4744/// isVectorShift - Returns true if the shuffle can be implemented as a 4745/// logical left or right shift of a vector. 4746static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4747 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4748 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 4749 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 4750 return true; 4751 4752 return false; 4753} 4754 4755/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
4756/// 4757static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 4758 unsigned NumNonZero, unsigned NumZero, 4759 SelectionDAG &DAG, 4760 const TargetLowering &TLI) { 4761 if (NumNonZero > 8) 4762 return SDValue(); 4763 4764 DebugLoc dl = Op.getDebugLoc(); 4765 SDValue V(0, 0); 4766 bool First = true; 4767 for (unsigned i = 0; i < 16; ++i) { 4768 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 4769 if (ThisIsNonZero && First) { 4770 if (NumZero) 4771 V = getZeroVector(MVT::v8i16, true, DAG, dl); 4772 else 4773 V = DAG.getUNDEF(MVT::v8i16); 4774 First = false; 4775 } 4776 4777 if ((i & 1) != 0) { 4778 SDValue ThisElt(0, 0), LastElt(0, 0); 4779 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 4780 if (LastIsNonZero) { 4781 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 4782 MVT::i16, Op.getOperand(i-1)); 4783 } 4784 if (ThisIsNonZero) { 4785 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 4786 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 4787 ThisElt, DAG.getConstant(8, MVT::i8)); 4788 if (LastIsNonZero) 4789 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 4790 } else 4791 ThisElt = LastElt; 4792 4793 if (ThisElt.getNode()) 4794 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 4795 DAG.getIntPtrConstant(i/2)); 4796 } 4797 } 4798 4799 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 4800} 4801 4802/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 4803/// 4804static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 4805 unsigned NumNonZero, unsigned NumZero, 4806 SelectionDAG &DAG, 4807 const TargetLowering &TLI) { 4808 if (NumNonZero > 4) 4809 return SDValue(); 4810 4811 DebugLoc dl = Op.getDebugLoc(); 4812 SDValue V(0, 0); 4813 bool First = true; 4814 for (unsigned i = 0; i < 8; ++i) { 4815 bool isNonZero = (NonZeros & (1 << i)) != 0; 4816 if (isNonZero) { 4817 if (First) { 4818 if (NumZero) 4819 V = getZeroVector(MVT::v8i16, true, DAG, dl); 4820 else 4821 V = DAG.getUNDEF(MVT::v8i16); 4822 First = false; 4823 } 4824 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 4825 MVT::v8i16, V, Op.getOperand(i), 4826 DAG.getIntPtrConstant(i)); 4827 } 4828 } 4829 4830 return V; 4831} 4832 4833/// getVShift - Return a vector logical shift node. 4834/// 4835static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 4836 unsigned NumBits, SelectionDAG &DAG, 4837 const TargetLowering &TLI, DebugLoc dl) { 4838 EVT ShVT = MVT::v2i64; 4839 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 4840 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 4841 return DAG.getNode(ISD::BITCAST, dl, VT, 4842 DAG.getNode(Opc, dl, ShVT, SrcOp, 4843 DAG.getConstant(NumBits, 4844 TLI.getShiftAmountTy(SrcOp.getValueType())))); 4845} 4846 4847SDValue 4848X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 4849 SelectionDAG &DAG) const { 4850 4851 // Check if the scalar load can be widened into a vector load. And if 4852 // the address is "base + cst" see if the cst can be "absorbed" into 4853 // the shuffle mask. 
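  // For example, a splat of (f32 load (FrameIndex + 8)) can be widened to a
  // v4f32 load of the whole (suitably aligned) stack slot, with the +8 offset
  // absorbed into a <2,2,2,2> splat shuffle mask.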
4854 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 4855 SDValue Ptr = LD->getBasePtr(); 4856 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 4857 return SDValue(); 4858 EVT PVT = LD->getValueType(0); 4859 if (PVT != MVT::i32 && PVT != MVT::f32) 4860 return SDValue(); 4861 4862 int FI = -1; 4863 int64_t Offset = 0; 4864 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 4865 FI = FINode->getIndex(); 4866 Offset = 0; 4867 } else if (DAG.isBaseWithConstantOffset(Ptr) && 4868 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 4869 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 4870 Offset = Ptr.getConstantOperandVal(1); 4871 Ptr = Ptr.getOperand(0); 4872 } else { 4873 return SDValue(); 4874 } 4875 4876 // FIXME: 256-bit vector instructions don't require a strict alignment, 4877 // improve this code to support it better. 4878 unsigned RequiredAlign = VT.getSizeInBits()/8; 4879 SDValue Chain = LD->getChain(); 4880 // Make sure the stack object alignment is at least 16 or 32. 4881 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4882 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { 4883 if (MFI->isFixedObjectIndex(FI)) { 4884 // Can't change the alignment. FIXME: It's possible to compute 4885 // the exact stack offset and reference FI + adjust offset instead. 4886 // If someone *really* cares about this. That's the way to implement it. 4887 return SDValue(); 4888 } else { 4889 MFI->setObjectAlignment(FI, RequiredAlign); 4890 } 4891 } 4892 4893 // (Offset % 16 or 32) must be multiple of 4. Then address is then 4894 // Ptr + (Offset & ~15). 4895 if (Offset < 0) 4896 return SDValue(); 4897 if ((Offset % RequiredAlign) & 3) 4898 return SDValue(); 4899 int64_t StartOffset = Offset & ~(RequiredAlign-1); 4900 if (StartOffset) 4901 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 4902 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 4903 4904 int EltNo = (Offset - StartOffset) >> 2; 4905 int NumElems = VT.getVectorNumElements(); 4906 4907 EVT CanonVT = VT.getSizeInBits() == 128 ? MVT::v4i32 : MVT::v8i32; 4908 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); 4909 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, 4910 LD->getPointerInfo().getWithOffset(StartOffset), 4911 false, false, 0); 4912 4913 // Canonicalize it to a v4i32 or v8i32 shuffle. 4914 SmallVector<int, 8> Mask; 4915 for (int i = 0; i < NumElems; ++i) 4916 Mask.push_back(EltNo); 4917 4918 V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1); 4919 return DAG.getNode(ISD::BITCAST, dl, NVT, 4920 DAG.getVectorShuffle(CanonVT, dl, V1, 4921 DAG.getUNDEF(CanonVT),&Mask[0])); 4922 } 4923 4924 return SDValue(); 4925} 4926 4927/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 4928/// vector of type 'VT', see if the elements can be replaced by a single large 4929/// load which has the same value as a build_vector whose operands are 'elts'. 4930/// 4931/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 4932/// 4933/// FIXME: we'd also like to handle the case where the last elements are zero 4934/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 4935/// There's even a handy isZeroNode for that purpose. 
4936static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 4937 DebugLoc &DL, SelectionDAG &DAG) { 4938 EVT EltVT = VT.getVectorElementType(); 4939 unsigned NumElems = Elts.size(); 4940 4941 LoadSDNode *LDBase = NULL; 4942 unsigned LastLoadedElt = -1U; 4943 4944 // For each element in the initializer, see if we've found a load or an undef. 4945 // If we don't find an initial load element, or later load elements are 4946 // non-consecutive, bail out. 4947 for (unsigned i = 0; i < NumElems; ++i) { 4948 SDValue Elt = Elts[i]; 4949 4950 if (!Elt.getNode() || 4951 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 4952 return SDValue(); 4953 if (!LDBase) { 4954 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 4955 return SDValue(); 4956 LDBase = cast<LoadSDNode>(Elt.getNode()); 4957 LastLoadedElt = i; 4958 continue; 4959 } 4960 if (Elt.getOpcode() == ISD::UNDEF) 4961 continue; 4962 4963 LoadSDNode *LD = cast<LoadSDNode>(Elt); 4964 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 4965 return SDValue(); 4966 LastLoadedElt = i; 4967 } 4968 4969 // If we have found an entire vector of loads and undefs, then return a large 4970 // load of the entire vector width starting at the base pointer. If we found 4971 // consecutive loads for the low half, generate a vzext_load node. 4972 if (LastLoadedElt == NumElems - 1) { 4973 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 4974 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4975 LDBase->getPointerInfo(), 4976 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 4977 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4978 LDBase->getPointerInfo(), 4979 LDBase->isVolatile(), LDBase->isNonTemporal(), 4980 LDBase->getAlignment()); 4981 } else if (NumElems == 4 && LastLoadedElt == 1 && 4982 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { 4983 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 4984 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 4985 SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, 4986 Ops, 2, MVT::i32, 4987 LDBase->getMemOperand()); 4988 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 4989 } 4990 return SDValue(); 4991} 4992 4993SDValue 4994X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 4995 DebugLoc dl = Op.getDebugLoc(); 4996 4997 EVT VT = Op.getValueType(); 4998 EVT ExtVT = VT.getVectorElementType(); 4999 unsigned NumElems = Op.getNumOperands(); 5000 5001 // Vectors containing all zeros can be matched by pxor and xorps later 5002 if (ISD::isBuildVectorAllZeros(Op.getNode())) { 5003 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd 5004 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. 5005 if (Op.getValueType() == MVT::v4i32 || 5006 Op.getValueType() == MVT::v8i32) 5007 return Op; 5008 5009 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 5010 } 5011 5012 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width 5013 // vectors or broken into v4i32 operations on 256-bit vectors. 
5014 if (ISD::isBuildVectorAllOnes(Op.getNode())) { 5015 if (Op.getValueType() == MVT::v4i32) 5016 return Op; 5017 5018 return getOnesVector(Op.getValueType(), DAG, dl); 5019 } 5020 5021 unsigned EVTBits = ExtVT.getSizeInBits(); 5022 5023 unsigned NumZero = 0; 5024 unsigned NumNonZero = 0; 5025 unsigned NonZeros = 0; 5026 bool IsAllConstants = true; 5027 SmallSet<SDValue, 8> Values; 5028 for (unsigned i = 0; i < NumElems; ++i) { 5029 SDValue Elt = Op.getOperand(i); 5030 if (Elt.getOpcode() == ISD::UNDEF) 5031 continue; 5032 Values.insert(Elt); 5033 if (Elt.getOpcode() != ISD::Constant && 5034 Elt.getOpcode() != ISD::ConstantFP) 5035 IsAllConstants = false; 5036 if (X86::isZeroNode(Elt)) 5037 NumZero++; 5038 else { 5039 NonZeros |= (1 << i); 5040 NumNonZero++; 5041 } 5042 } 5043 5044 // All undef vector. Return an UNDEF. All zero vectors were handled above. 5045 if (NumNonZero == 0) 5046 return DAG.getUNDEF(VT); 5047 5048 // Special case for single non-zero, non-undef, element. 5049 if (NumNonZero == 1) { 5050 unsigned Idx = CountTrailingZeros_32(NonZeros); 5051 SDValue Item = Op.getOperand(Idx); 5052 5053 // If this is an insertion of an i64 value on x86-32, and if the top bits of 5054 // the value are obviously zero, truncate the value to i32 and do the 5055 // insertion that way. Only do this if the value is non-constant or if the 5056 // value is a constant being inserted into element 0. It is cheaper to do 5057 // a constant pool load than it is to do a movd + shuffle. 5058 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 5059 (!IsAllConstants || Idx == 0)) { 5060 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 5061 // Handle SSE only. 5062 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 5063 EVT VecVT = MVT::v4i32; 5064 unsigned VecElts = 4; 5065 5066 // Truncate the value (which may itself be a constant) to i32, and 5067 // convert it to a vector with movd (S2V+shuffle to zero extend). 5068 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 5069 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 5070 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 5071 Subtarget->hasSSE2(), DAG); 5072 5073 // Now we have our 32-bit value zero extended in the low element of 5074 // a vector. If Idx != 0, swizzle it into place. 5075 if (Idx != 0) { 5076 SmallVector<int, 4> Mask; 5077 Mask.push_back(Idx); 5078 for (unsigned i = 1; i != VecElts; ++i) 5079 Mask.push_back(i); 5080 Item = DAG.getVectorShuffle(VecVT, dl, Item, 5081 DAG.getUNDEF(Item.getValueType()), 5082 &Mask[0]); 5083 } 5084 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); 5085 } 5086 } 5087 5088 // If we have a constant or non-constant insertion into the low element of 5089 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 5090 // the rest of the elements. This will be matched as movd/movq/movss/movsd 5091 // depending on what the source datatype is. 5092 if (Idx == 0) { 5093 if (NumZero == 0) { 5094 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5095 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 5096 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 5097 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5098 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 
5099 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 5100 DAG); 5101 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 5102 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 5103 assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); 5104 EVT MiddleVT = MVT::v4i32; 5105 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 5106 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 5107 Subtarget->hasSSE2(), DAG); 5108 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 5109 } 5110 } 5111 5112 // Is it a vector logical left shift? 5113 if (NumElems == 2 && Idx == 1 && 5114 X86::isZeroNode(Op.getOperand(0)) && 5115 !X86::isZeroNode(Op.getOperand(1))) { 5116 unsigned NumBits = VT.getSizeInBits(); 5117 return getVShift(true, VT, 5118 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5119 VT, Op.getOperand(1)), 5120 NumBits/2, DAG, *this, dl); 5121 } 5122 5123 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 5124 return SDValue(); 5125 5126 // Otherwise, if this is a vector with i32 or f32 elements, and the element 5127 // is a non-constant being inserted into an element other than the low one, 5128 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 5129 // movd/movss) to move this into the low element, then shuffle it into 5130 // place. 5131 if (EVTBits == 32) { 5132 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5133 5134 // Turn it into a shuffle of zero and zero-extended scalar to vector. 5135 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 5136 Subtarget->hasSSE2(), DAG); 5137 SmallVector<int, 8> MaskVec; 5138 for (unsigned i = 0; i < NumElems; i++) 5139 MaskVec.push_back(i == Idx ? 0 : 1); 5140 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 5141 } 5142 } 5143 5144 // Splat is obviously ok. Let legalizer expand it to a shuffle. 5145 if (Values.size() == 1) { 5146 if (EVTBits == 32) { 5147 // Instead of a shuffle like this: 5148 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 5149 // Check if it's possible to issue this instead. 5150 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 5151 unsigned Idx = CountTrailingZeros_32(NonZeros); 5152 SDValue Item = Op.getOperand(Idx); 5153 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 5154 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 5155 } 5156 return SDValue(); 5157 } 5158 5159 // A vector full of immediates; various special cases are already 5160 // handled, so this is best done with a single constant-pool load. 5161 if (IsAllConstants) 5162 return SDValue(); 5163 5164 // For AVX-length vectors, build the individual 128-bit pieces and use 5165 // shuffles to put them in place. 5166 if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) { 5167 SmallVector<SDValue, 32> V; 5168 for (unsigned i = 0; i < NumElems; ++i) 5169 V.push_back(Op.getOperand(i)); 5170 5171 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); 5172 5173 // Build both the lower and upper subvector. 5174 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); 5175 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], 5176 NumElems/2); 5177 5178 // Recreate the wider vector with the lower and upper part. 
5179 SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower, 5180 DAG.getConstant(0, MVT::i32), DAG, dl); 5181 return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32), 5182 DAG, dl); 5183 } 5184 5185 // Let legalizer expand 2-wide build_vectors. 5186 if (EVTBits == 64) { 5187 if (NumNonZero == 1) { 5188 // One half is zero or undef. 5189 unsigned Idx = CountTrailingZeros_32(NonZeros); 5190 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 5191 Op.getOperand(Idx)); 5192 return getShuffleVectorZeroOrUndef(V2, Idx, true, 5193 Subtarget->hasSSE2(), DAG); 5194 } 5195 return SDValue(); 5196 } 5197 5198 // If element VT is < 32 bits, convert it to inserts into a zero vector. 5199 if (EVTBits == 8 && NumElems == 16) { 5200 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 5201 *this); 5202 if (V.getNode()) return V; 5203 } 5204 5205 if (EVTBits == 16 && NumElems == 8) { 5206 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 5207 *this); 5208 if (V.getNode()) return V; 5209 } 5210 5211 // If element VT is == 32 bits, turn it into a number of shuffles. 5212 SmallVector<SDValue, 8> V; 5213 V.resize(NumElems); 5214 if (NumElems == 4 && NumZero > 0) { 5215 for (unsigned i = 0; i < 4; ++i) { 5216 bool isZero = !(NonZeros & (1 << i)); 5217 if (isZero) 5218 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 5219 else 5220 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 5221 } 5222 5223 for (unsigned i = 0; i < 2; ++i) { 5224 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 5225 default: break; 5226 case 0: 5227 V[i] = V[i*2]; // Must be a zero vector. 5228 break; 5229 case 1: 5230 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 5231 break; 5232 case 2: 5233 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 5234 break; 5235 case 3: 5236 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 5237 break; 5238 } 5239 } 5240 5241 SmallVector<int, 8> MaskVec; 5242 bool Reverse = (NonZeros & 0x3) == 2; 5243 for (unsigned i = 0; i < 2; ++i) 5244 MaskVec.push_back(Reverse ? 1-i : i); 5245 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 5246 for (unsigned i = 0; i < 2; ++i) 5247 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 5248 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 5249 } 5250 5251 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 5252 // Check for a build vector of consecutive loads. 5253 for (unsigned i = 0; i < NumElems; ++i) 5254 V[i] = Op.getOperand(i); 5255 5256 // Check for elements which are consecutive loads. 5257 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 5258 if (LD.getNode()) 5259 return LD; 5260 5261 // For SSE 4.1, use insertps to put the high elements into the low element. 5262 if (getSubtarget()->hasSSE41()) { 5263 SDValue Result; 5264 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 5265 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 5266 else 5267 Result = DAG.getUNDEF(VT); 5268 5269 for (unsigned i = 1; i < NumElems; ++i) { 5270 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 5271 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 5272 Op.getOperand(i), DAG.getIntPtrConstant(i)); 5273 } 5274 return Result; 5275 } 5276 5277 // Otherwise, expand into a number of unpckl*, start by extending each of 5278 // our (non-undef) elements to the full vector width with the element in the 5279 // bottom slot of the vector (which generates no code for SSE). 
5280 for (unsigned i = 0; i < NumElems; ++i) { 5281 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 5282 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 5283 else 5284 V[i] = DAG.getUNDEF(VT); 5285 } 5286 5287 // Next, we iteratively mix elements, e.g. for v4f32: 5288 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 5289 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 5290 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 5291 unsigned EltStride = NumElems >> 1; 5292 while (EltStride != 0) { 5293 for (unsigned i = 0; i < EltStride; ++i) { 5294 // If V[i+EltStride] is undef and this is the first round of mixing, 5295 // then it is safe to just drop this shuffle: V[i] is already in the 5296 // right place, the one element (since it's the first round) being 5297 // inserted as undef can be dropped. This isn't safe for successive 5298 // rounds because they will permute elements within both vectors. 5299 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 5300 EltStride == NumElems/2) 5301 continue; 5302 5303 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 5304 } 5305 EltStride >>= 1; 5306 } 5307 return V[0]; 5308 } 5309 return SDValue(); 5310} 5311 5312// LowerMMXCONCAT_VECTORS - We support concatenate two MMX registers and place 5313// them in a MMX register. This is better than doing a stack convert. 5314static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 5315 DebugLoc dl = Op.getDebugLoc(); 5316 EVT ResVT = Op.getValueType(); 5317 5318 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 5319 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 5320 int Mask[2]; 5321 SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0)); 5322 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 5323 InVec = Op.getOperand(1); 5324 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 5325 unsigned NumElts = ResVT.getVectorNumElements(); 5326 VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 5327 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 5328 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 5329 } else { 5330 InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec); 5331 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 5332 Mask[0] = 0; Mask[1] = 2; 5333 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 5334 } 5335 return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 5336} 5337 5338// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction 5339// to create 256-bit vectors from two other 128-bit ones. 5340static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 5341 DebugLoc dl = Op.getDebugLoc(); 5342 EVT ResVT = Op.getValueType(); 5343 5344 assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide"); 5345 5346 SDValue V1 = Op.getOperand(0); 5347 SDValue V2 = Op.getOperand(1); 5348 unsigned NumElems = ResVT.getVectorNumElements(); 5349 5350 SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1, 5351 DAG.getConstant(0, MVT::i32), DAG, dl); 5352 return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), 5353 DAG, dl); 5354} 5355 5356SDValue 5357X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 5358 EVT ResVT = Op.getValueType(); 5359 5360 assert(Op.getNumOperands() == 2); 5361 assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) && 5362 "Unsupported CONCAT_VECTORS for value type"); 5363 5364 // We support concatenate two MMX registers and place them in a MMX register. 
5365 // This is better than doing a stack convert. 5366 if (ResVT.is128BitVector()) 5367 return LowerMMXCONCAT_VECTORS(Op, DAG); 5368 5369 // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors 5370 // from two other 128-bit ones. 5371 return LowerAVXCONCAT_VECTORS(Op, DAG); 5372} 5373 5374// v8i16 shuffles - Prefer shuffles in the following order: 5375// 1. [all] pshuflw, pshufhw, optional move 5376// 2. [ssse3] 1 x pshufb 5377// 3. [ssse3] 2 x pshufb + 1 x por 5378// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 5379SDValue 5380X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 5381 SelectionDAG &DAG) const { 5382 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5383 SDValue V1 = SVOp->getOperand(0); 5384 SDValue V2 = SVOp->getOperand(1); 5385 DebugLoc dl = SVOp->getDebugLoc(); 5386 SmallVector<int, 8> MaskVals; 5387 5388 // Determine if more than 1 of the words in each of the low and high quadwords 5389 // of the result come from the same quadword of one of the two inputs. Undef 5390 // mask values count as coming from any quadword, for better codegen. 5391 SmallVector<unsigned, 4> LoQuad(4); 5392 SmallVector<unsigned, 4> HiQuad(4); 5393 BitVector InputQuads(4); 5394 for (unsigned i = 0; i < 8; ++i) { 5395 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 5396 int EltIdx = SVOp->getMaskElt(i); 5397 MaskVals.push_back(EltIdx); 5398 if (EltIdx < 0) { 5399 ++Quad[0]; 5400 ++Quad[1]; 5401 ++Quad[2]; 5402 ++Quad[3]; 5403 continue; 5404 } 5405 ++Quad[EltIdx / 4]; 5406 InputQuads.set(EltIdx / 4); 5407 } 5408 5409 int BestLoQuad = -1; 5410 unsigned MaxQuad = 1; 5411 for (unsigned i = 0; i < 4; ++i) { 5412 if (LoQuad[i] > MaxQuad) { 5413 BestLoQuad = i; 5414 MaxQuad = LoQuad[i]; 5415 } 5416 } 5417 5418 int BestHiQuad = -1; 5419 MaxQuad = 1; 5420 for (unsigned i = 0; i < 4; ++i) { 5421 if (HiQuad[i] > MaxQuad) { 5422 BestHiQuad = i; 5423 MaxQuad = HiQuad[i]; 5424 } 5425 } 5426 5427 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 5428 // of the two input vectors, shuffle them into one input vector so only a 5429 // single pshufb instruction is necessary. If There are more than 2 input 5430 // quads, disable the next transformation since it does not help SSSE3. 5431 bool V1Used = InputQuads[0] || InputQuads[1]; 5432 bool V2Used = InputQuads[2] || InputQuads[3]; 5433 if (Subtarget->hasSSSE3()) { 5434 if (InputQuads.count() == 2 && V1Used && V2Used) { 5435 BestLoQuad = InputQuads.find_first(); 5436 BestHiQuad = InputQuads.find_next(BestLoQuad); 5437 } 5438 if (InputQuads.count() > 2) { 5439 BestLoQuad = -1; 5440 BestHiQuad = -1; 5441 } 5442 } 5443 5444 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 5445 // the shuffle mask. If a quad is scored as -1, that means that it contains 5446 // words from all 4 input quadwords. 5447 SDValue NewV; 5448 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 5449 SmallVector<int, 8> MaskV; 5450 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 5451 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 5452 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 5453 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 5454 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 5455 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 5456 5457 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 5458 // source words for the shuffle, to aid later transformations. 
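    // (For example, if BestLoQuad == 2 and BestHiQuad == 0, NewV now holds the
    // low quadword of V2 in its low half and the low quadword of V1 in its
    // high half, and the mask values are remapped to that layout.)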
5459 bool AllWordsInNewV = true; 5460 bool InOrder[2] = { true, true }; 5461 for (unsigned i = 0; i != 8; ++i) { 5462 int idx = MaskVals[i]; 5463 if (idx != (int)i) 5464 InOrder[i/4] = false; 5465 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 5466 continue; 5467 AllWordsInNewV = false; 5468 break; 5469 } 5470 5471 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 5472 if (AllWordsInNewV) { 5473 for (int i = 0; i != 8; ++i) { 5474 int idx = MaskVals[i]; 5475 if (idx < 0) 5476 continue; 5477 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 5478 if ((idx != i) && idx < 4) 5479 pshufhw = false; 5480 if ((idx != i) && idx > 3) 5481 pshuflw = false; 5482 } 5483 V1 = NewV; 5484 V2Used = false; 5485 BestLoQuad = 0; 5486 BestHiQuad = 1; 5487 } 5488 5489 // If we've eliminated the use of V2, and the new mask is a pshuflw or 5490 // pshufhw, that's as cheap as it gets. Return the new shuffle. 5491 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 5492 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 5493 unsigned TargetMask = 0; 5494 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 5495 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 5496 TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): 5497 X86::getShufflePSHUFLWImmediate(NewV.getNode()); 5498 V1 = NewV.getOperand(0); 5499 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 5500 } 5501 } 5502 5503 // If we have SSSE3, and all words of the result are from 1 input vector, 5504 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 5505 // is present, fall back to case 4. 5506 if (Subtarget->hasSSSE3()) { 5507 SmallVector<SDValue,16> pshufbMask; 5508 5509 // If we have elements from both input vectors, set the high bit of the 5510 // shuffle mask element to zero out elements that come from V2 in the V1 5511 // mask, and elements that come from V1 in the V2 mask, so that the two 5512 // results can be OR'd together. 5513 bool TwoInputs = V1Used && V2Used; 5514 for (unsigned i = 0; i != 8; ++i) { 5515 int EltIdx = MaskVals[i] * 2; 5516 if (TwoInputs && (EltIdx >= 16)) { 5517 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5518 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5519 continue; 5520 } 5521 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5522 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 5523 } 5524 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); 5525 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5526 DAG.getNode(ISD::BUILD_VECTOR, dl, 5527 MVT::v16i8, &pshufbMask[0], 16)); 5528 if (!TwoInputs) 5529 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5530 5531 // Calculate the shuffle mask for the second input, shuffle it, and 5532 // OR it with the first shuffled input. 
5533 pshufbMask.clear(); 5534 for (unsigned i = 0; i != 8; ++i) { 5535 int EltIdx = MaskVals[i] * 2; 5536 if (EltIdx < 16) { 5537 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5538 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5539 continue; 5540 } 5541 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 5542 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 5543 } 5544 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); 5545 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5546 DAG.getNode(ISD::BUILD_VECTOR, dl, 5547 MVT::v16i8, &pshufbMask[0], 16)); 5548 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5549 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5550 } 5551 5552 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 5553 // and update MaskVals with new element order. 5554 BitVector InOrder(8); 5555 if (BestLoQuad >= 0) { 5556 SmallVector<int, 8> MaskV; 5557 for (int i = 0; i != 4; ++i) { 5558 int idx = MaskVals[i]; 5559 if (idx < 0) { 5560 MaskV.push_back(-1); 5561 InOrder.set(i); 5562 } else if ((idx / 4) == BestLoQuad) { 5563 MaskV.push_back(idx & 3); 5564 InOrder.set(i); 5565 } else { 5566 MaskV.push_back(-1); 5567 } 5568 } 5569 for (unsigned i = 4; i != 8; ++i) 5570 MaskV.push_back(i); 5571 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5572 &MaskV[0]); 5573 5574 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 5575 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 5576 NewV.getOperand(0), 5577 X86::getShufflePSHUFLWImmediate(NewV.getNode()), 5578 DAG); 5579 } 5580 5581 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 5582 // and update MaskVals with the new element order. 5583 if (BestHiQuad >= 0) { 5584 SmallVector<int, 8> MaskV; 5585 for (unsigned i = 0; i != 4; ++i) 5586 MaskV.push_back(i); 5587 for (unsigned i = 4; i != 8; ++i) { 5588 int idx = MaskVals[i]; 5589 if (idx < 0) { 5590 MaskV.push_back(-1); 5591 InOrder.set(i); 5592 } else if ((idx / 4) == BestHiQuad) { 5593 MaskV.push_back((idx & 3) + 4); 5594 InOrder.set(i); 5595 } else { 5596 MaskV.push_back(-1); 5597 } 5598 } 5599 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5600 &MaskV[0]); 5601 5602 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 5603 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 5604 NewV.getOperand(0), 5605 X86::getShufflePSHUFHWImmediate(NewV.getNode()), 5606 DAG); 5607 } 5608 5609 // In case BestHi & BestLo were both -1, which means each quadword has a word 5610 // from each of the four input quadwords, calculate the InOrder bitvector now 5611 // before falling through to the insert/extract cleanup. 5612 if (BestLoQuad == -1 && BestHiQuad == -1) { 5613 NewV = V1; 5614 for (int i = 0; i != 8; ++i) 5615 if (MaskVals[i] < 0 || MaskVals[i] == i) 5616 InOrder.set(i); 5617 } 5618 5619 // The other elements are put in the right place using pextrw and pinsrw. 5620 for (unsigned i = 0; i != 8; ++i) { 5621 if (InOrder[i]) 5622 continue; 5623 int EltIdx = MaskVals[i]; 5624 if (EltIdx < 0) 5625 continue; 5626 SDValue ExtOp = (EltIdx < 8) 5627 ? 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 5628 DAG.getIntPtrConstant(EltIdx)) 5629 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 5630 DAG.getIntPtrConstant(EltIdx - 8)); 5631 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 5632 DAG.getIntPtrConstant(i)); 5633 } 5634 return NewV; 5635} 5636 5637// v16i8 shuffles - Prefer shuffles in the following order: 5638// 1. [ssse3] 1 x pshufb 5639// 2. [ssse3] 2 x pshufb + 1 x por 5640// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 5641static 5642SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 5643 SelectionDAG &DAG, 5644 const X86TargetLowering &TLI) { 5645 SDValue V1 = SVOp->getOperand(0); 5646 SDValue V2 = SVOp->getOperand(1); 5647 DebugLoc dl = SVOp->getDebugLoc(); 5648 SmallVector<int, 16> MaskVals; 5649 SVOp->getMask(MaskVals); 5650 5651 // If we have SSSE3, case 1 is generated when all result bytes come from 5652 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 5653 // present, fall back to case 3. 5654 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 5655 bool V1Only = true; 5656 bool V2Only = true; 5657 for (unsigned i = 0; i < 16; ++i) { 5658 int EltIdx = MaskVals[i]; 5659 if (EltIdx < 0) 5660 continue; 5661 if (EltIdx < 16) 5662 V2Only = false; 5663 else 5664 V1Only = false; 5665 } 5666 5667 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 5668 if (TLI.getSubtarget()->hasSSSE3()) { 5669 SmallVector<SDValue,16> pshufbMask; 5670 5671 // If all result elements are from one input vector, then only translate 5672 // undef mask values to 0x80 (zero out result) in the pshufb mask. 5673 // 5674 // Otherwise, we have elements from both input vectors, and must zero out 5675 // elements that come from V2 in the first mask, and V1 in the second mask 5676 // so that we can OR them together. 5677 bool TwoInputs = !(V1Only || V2Only); 5678 for (unsigned i = 0; i != 16; ++i) { 5679 int EltIdx = MaskVals[i]; 5680 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 5681 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5682 continue; 5683 } 5684 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5685 } 5686 // If all the elements are from V2, assign it to V1 and return after 5687 // building the first pshufb. 5688 if (V2Only) 5689 V1 = V2; 5690 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5691 DAG.getNode(ISD::BUILD_VECTOR, dl, 5692 MVT::v16i8, &pshufbMask[0], 16)); 5693 if (!TwoInputs) 5694 return V1; 5695 5696 // Calculate the shuffle mask for the second input, shuffle it, and 5697 // OR it with the first shuffled input. 5698 pshufbMask.clear(); 5699 for (unsigned i = 0; i != 16; ++i) { 5700 int EltIdx = MaskVals[i]; 5701 if (EltIdx < 16) { 5702 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5703 continue; 5704 } 5705 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 5706 } 5707 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5708 DAG.getNode(ISD::BUILD_VECTOR, dl, 5709 MVT::v16i8, &pshufbMask[0], 16)); 5710 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5711 } 5712 5713 // No SSSE3 - Calculate in place words and then fix all out of place words 5714 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 5715 // the 16 different words that comprise the two doublequadword input vectors. 5716 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5717 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); 5718 SDValue NewV = V2Only ? 
V2 : V1; 5719 for (int i = 0; i != 8; ++i) { 5720 int Elt0 = MaskVals[i*2]; 5721 int Elt1 = MaskVals[i*2+1]; 5722 5723 // This word of the result is all undef, skip it. 5724 if (Elt0 < 0 && Elt1 < 0) 5725 continue; 5726 5727 // This word of the result is already in the correct place, skip it. 5728 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 5729 continue; 5730 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 5731 continue; 5732 5733 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 5734 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 5735 SDValue InsElt; 5736 5737 // If Elt0 and Elt1 are defined, are consecutive, and can be load 5738 // using a single extract together, load it and store it. 5739 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 5740 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5741 DAG.getIntPtrConstant(Elt1 / 2)); 5742 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5743 DAG.getIntPtrConstant(i)); 5744 continue; 5745 } 5746 5747 // If Elt1 is defined, extract it from the appropriate source. If the 5748 // source byte is not also odd, shift the extracted word left 8 bits 5749 // otherwise clear the bottom 8 bits if we need to do an or. 5750 if (Elt1 >= 0) { 5751 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5752 DAG.getIntPtrConstant(Elt1 / 2)); 5753 if ((Elt1 & 1) == 0) 5754 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 5755 DAG.getConstant(8, 5756 TLI.getShiftAmountTy(InsElt.getValueType()))); 5757 else if (Elt0 >= 0) 5758 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 5759 DAG.getConstant(0xFF00, MVT::i16)); 5760 } 5761 // If Elt0 is defined, extract it from the appropriate source. If the 5762 // source byte is not also even, shift the extracted word right 8 bits. If 5763 // Elt1 was also defined, OR the extracted values together before 5764 // inserting them in the result. 5765 if (Elt0 >= 0) { 5766 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 5767 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 5768 if ((Elt0 & 1) != 0) 5769 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 5770 DAG.getConstant(8, 5771 TLI.getShiftAmountTy(InsElt0.getValueType()))); 5772 else if (Elt1 >= 0) 5773 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 5774 DAG.getConstant(0x00FF, MVT::i16)); 5775 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 5776 : InsElt0; 5777 } 5778 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5779 DAG.getIntPtrConstant(i)); 5780 } 5781 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); 5782} 5783 5784/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 5785/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 5786/// done when every pair / quad of shuffle mask elements point to elements in 5787/// the right sequence. e.g. 5788/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> 5789static 5790SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 5791 SelectionDAG &DAG, DebugLoc dl) { 5792 EVT VT = SVOp->getValueType(0); 5793 SDValue V1 = SVOp->getOperand(0); 5794 SDValue V2 = SVOp->getOperand(1); 5795 unsigned NumElems = VT.getVectorNumElements(); 5796 unsigned NewWidth = (NumElems == 4) ? 
2 : 4; 5797 EVT NewVT; 5798 switch (VT.getSimpleVT().SimpleTy) { 5799 default: assert(false && "Unexpected!"); 5800 case MVT::v4f32: NewVT = MVT::v2f64; break; 5801 case MVT::v4i32: NewVT = MVT::v2i64; break; 5802 case MVT::v8i16: NewVT = MVT::v4i32; break; 5803 case MVT::v16i8: NewVT = MVT::v4i32; break; 5804 } 5805 5806 int Scale = NumElems / NewWidth; 5807 SmallVector<int, 8> MaskVec; 5808 for (unsigned i = 0; i < NumElems; i += Scale) { 5809 int StartIdx = -1; 5810 for (int j = 0; j < Scale; ++j) { 5811 int EltIdx = SVOp->getMaskElt(i+j); 5812 if (EltIdx < 0) 5813 continue; 5814 if (StartIdx == -1) 5815 StartIdx = EltIdx - (EltIdx % Scale); 5816 if (EltIdx != StartIdx + j) 5817 return SDValue(); 5818 } 5819 if (StartIdx == -1) 5820 MaskVec.push_back(-1); 5821 else 5822 MaskVec.push_back(StartIdx / Scale); 5823 } 5824 5825 V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); 5826 V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); 5827 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 5828} 5829 5830/// getVZextMovL - Return a zero-extending vector move low node. 5831/// 5832static SDValue getVZextMovL(EVT VT, EVT OpVT, 5833 SDValue SrcOp, SelectionDAG &DAG, 5834 const X86Subtarget *Subtarget, DebugLoc dl) { 5835 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 5836 LoadSDNode *LD = NULL; 5837 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 5838 LD = dyn_cast<LoadSDNode>(SrcOp); 5839 if (!LD) { 5840 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 5841 // instead. 5842 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 5843 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && 5844 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 5845 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && 5846 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 5847 // PR2108 5848 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 5849 return DAG.getNode(ISD::BITCAST, dl, VT, 5850 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 5851 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5852 OpVT, 5853 SrcOp.getOperand(0) 5854 .getOperand(0)))); 5855 } 5856 } 5857 } 5858 5859 return DAG.getNode(ISD::BITCAST, dl, VT, 5860 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 5861 DAG.getNode(ISD::BITCAST, dl, 5862 OpVT, SrcOp))); 5863} 5864 5865/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector 5866/// shuffle node referes to only one lane in the sources. 5867static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) { 5868 EVT VT = SVOp->getValueType(0); 5869 int NumElems = VT.getVectorNumElements(); 5870 int HalfSize = NumElems/2; 5871 SmallVector<int, 16> M; 5872 SVOp->getMask(M); 5873 bool MatchA = false, MatchB = false; 5874 5875 for (int l = 0; l < NumElems*2; l += HalfSize) { 5876 if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) { 5877 MatchA = true; 5878 break; 5879 } 5880 } 5881 5882 for (int l = 0; l < NumElems*2; l += HalfSize) { 5883 if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) { 5884 MatchB = true; 5885 break; 5886 } 5887 } 5888 5889 return MatchA && MatchB; 5890} 5891 5892/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles 5893/// which could not be matched by any known target speficic shuffle 5894static SDValue 5895LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 5896 if (areShuffleHalvesWithinDisjointLanes(SVOp)) { 5897 // If each half of a vector shuffle node referes to only one lane in the 5898 // source vectors, extract each used 128-bit lane and shuffle them using 5899 // 128-bit shuffles. 
Then, concatenate the results. Otherwise leave 5900 // the work to the legalizer. 5901 DebugLoc dl = SVOp->getDebugLoc(); 5902 EVT VT = SVOp->getValueType(0); 5903 int NumElems = VT.getVectorNumElements(); 5904 int HalfSize = NumElems/2; 5905 5906 // Extract the reference for each half 5907 int FstVecExtractIdx = 0, SndVecExtractIdx = 0; 5908 int FstVecOpNum = 0, SndVecOpNum = 0; 5909 for (int i = 0; i < HalfSize; ++i) { 5910 int Elt = SVOp->getMaskElt(i); 5911 if (SVOp->getMaskElt(i) < 0) 5912 continue; 5913 FstVecOpNum = Elt/NumElems; 5914 FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize; 5915 break; 5916 } 5917 for (int i = HalfSize; i < NumElems; ++i) { 5918 int Elt = SVOp->getMaskElt(i); 5919 if (SVOp->getMaskElt(i) < 0) 5920 continue; 5921 SndVecOpNum = Elt/NumElems; 5922 SndVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize; 5923 break; 5924 } 5925 5926 // Extract the subvectors 5927 SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum), 5928 DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl); 5929 SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum), 5930 DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl); 5931 5932 // Generate 128-bit shuffles 5933 SmallVector<int, 16> MaskV1, MaskV2; 5934 for (int i = 0; i < HalfSize; ++i) { 5935 int Elt = SVOp->getMaskElt(i); 5936 MaskV1.push_back(Elt < 0 ? Elt : Elt % HalfSize); 5937 } 5938 for (int i = HalfSize; i < NumElems; ++i) { 5939 int Elt = SVOp->getMaskElt(i); 5940 MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize); 5941 } 5942 5943 EVT NVT = V1.getValueType(); 5944 V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]); 5945 V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]); 5946 5947 // Concatenate the result back 5948 SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1, 5949 DAG.getConstant(0, MVT::i32), DAG, dl); 5950 return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), 5951 DAG, dl); 5952 } 5953 5954 return SDValue(); 5955} 5956 5957/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with 5958/// 4 elements, and match them with several different shuffle types. 5959static SDValue 5960LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 5961 SDValue V1 = SVOp->getOperand(0); 5962 SDValue V2 = SVOp->getOperand(1); 5963 DebugLoc dl = SVOp->getDebugLoc(); 5964 EVT VT = SVOp->getValueType(0); 5965 5966 assert(VT.getSizeInBits() == 128 && "Unsupported vector size"); 5967 5968 SmallVector<std::pair<int, int>, 8> Locs; 5969 Locs.resize(4); 5970 SmallVector<int, 8> Mask1(4U, -1); 5971 SmallVector<int, 8> PermMask; 5972 SVOp->getMask(PermMask); 5973 5974 unsigned NumHi = 0; 5975 unsigned NumLo = 0; 5976 for (unsigned i = 0; i != 4; ++i) { 5977 int Idx = PermMask[i]; 5978 if (Idx < 0) { 5979 Locs[i] = std::make_pair(-1, -1); 5980 } else { 5981 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 5982 if (Idx < 4) { 5983 Locs[i] = std::make_pair(0, NumLo); 5984 Mask1[NumLo] = Idx; 5985 NumLo++; 5986 } else { 5987 Locs[i] = std::make_pair(1, NumHi); 5988 if (2+NumHi < 4) 5989 Mask1[2+NumHi] = Idx; 5990 NumHi++; 5991 } 5992 } 5993 } 5994 5995 if (NumLo <= 2 && NumHi <= 2) { 5996 // If no more than two elements come from either vector. This can be 5997 // implemented with two shuffles. First shuffle gather the elements. 5998 // The second shuffle, which takes the first shuffle as both of its 5999 // vector operands, put the elements into the right order. 
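    // For example, for the mask <0, 4, 1, 5> the first shuffle gathers
    // <0, 1, 4, 5>, and the second shuffle of that result with itself uses
    // the mask <0, 2, 5, 7> to produce the requested <0, 4, 1, 5> order.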
6000 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 6001 6002 SmallVector<int, 8> Mask2(4U, -1); 6003 6004 for (unsigned i = 0; i != 4; ++i) { 6005 if (Locs[i].first == -1) 6006 continue; 6007 else { 6008 unsigned Idx = (i < 2) ? 0 : 4; 6009 Idx += Locs[i].first * 2 + Locs[i].second; 6010 Mask2[i] = Idx; 6011 } 6012 } 6013 6014 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 6015 } else if (NumLo == 3 || NumHi == 3) { 6016 // Otherwise, we must have three elements from one vector, call it X, and 6017 // one element from the other, call it Y. First, use a shufps to build an 6018 // intermediate vector with the one element from Y and the element from X 6019 // that will be in the same half in the final destination (the indexes don't 6020 // matter). Then, use a shufps to build the final vector, taking the half 6021 // containing the element from Y from the intermediate, and the other half 6022 // from X. 6023 if (NumHi == 3) { 6024 // Normalize it so the 3 elements come from V1. 6025 CommuteVectorShuffleMask(PermMask, VT); 6026 std::swap(V1, V2); 6027 } 6028 6029 // Find the element from V2. 6030 unsigned HiIndex; 6031 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 6032 int Val = PermMask[HiIndex]; 6033 if (Val < 0) 6034 continue; 6035 if (Val >= 4) 6036 break; 6037 } 6038 6039 Mask1[0] = PermMask[HiIndex]; 6040 Mask1[1] = -1; 6041 Mask1[2] = PermMask[HiIndex^1]; 6042 Mask1[3] = -1; 6043 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 6044 6045 if (HiIndex >= 2) { 6046 Mask1[0] = PermMask[0]; 6047 Mask1[1] = PermMask[1]; 6048 Mask1[2] = HiIndex & 1 ? 6 : 4; 6049 Mask1[3] = HiIndex & 1 ? 4 : 6; 6050 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 6051 } else { 6052 Mask1[0] = HiIndex & 1 ? 2 : 0; 6053 Mask1[1] = HiIndex & 1 ? 0 : 2; 6054 Mask1[2] = PermMask[2]; 6055 Mask1[3] = PermMask[3]; 6056 if (Mask1[2] >= 0) 6057 Mask1[2] += 4; 6058 if (Mask1[3] >= 0) 6059 Mask1[3] += 4; 6060 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 6061 } 6062 } 6063 6064 // Break it into (shuffle shuffle_hi, shuffle_lo). 
6065 Locs.clear(); 6066 Locs.resize(4); 6067 SmallVector<int,8> LoMask(4U, -1); 6068 SmallVector<int,8> HiMask(4U, -1); 6069 6070 SmallVector<int,8> *MaskPtr = &LoMask; 6071 unsigned MaskIdx = 0; 6072 unsigned LoIdx = 0; 6073 unsigned HiIdx = 2; 6074 for (unsigned i = 0; i != 4; ++i) { 6075 if (i == 2) { 6076 MaskPtr = &HiMask; 6077 MaskIdx = 1; 6078 LoIdx = 0; 6079 HiIdx = 2; 6080 } 6081 int Idx = PermMask[i]; 6082 if (Idx < 0) { 6083 Locs[i] = std::make_pair(-1, -1); 6084 } else if (Idx < 4) { 6085 Locs[i] = std::make_pair(MaskIdx, LoIdx); 6086 (*MaskPtr)[LoIdx] = Idx; 6087 LoIdx++; 6088 } else { 6089 Locs[i] = std::make_pair(MaskIdx, HiIdx); 6090 (*MaskPtr)[HiIdx] = Idx; 6091 HiIdx++; 6092 } 6093 } 6094 6095 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 6096 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 6097 SmallVector<int, 8> MaskOps; 6098 for (unsigned i = 0; i != 4; ++i) { 6099 if (Locs[i].first == -1) { 6100 MaskOps.push_back(-1); 6101 } else { 6102 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 6103 MaskOps.push_back(Idx); 6104 } 6105 } 6106 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 6107} 6108 6109static bool MayFoldVectorLoad(SDValue V) { 6110 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 6111 V = V.getOperand(0); 6112 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 6113 V = V.getOperand(0); 6114 if (MayFoldLoad(V)) 6115 return true; 6116 return false; 6117} 6118 6119// FIXME: the version above should always be used. Since there's 6120// a bug where several vector shuffles can't be folded because the 6121// DAG is not updated during lowering and a node claims to have two 6122// uses while it only has one, use this version, and let isel match 6123// another instruction if the load really happens to have more than 6124// one use. Remove this version after this bug get fixed. 6125// rdar://8434668, PR8156 6126static bool RelaxedMayFoldVectorLoad(SDValue V) { 6127 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 6128 V = V.getOperand(0); 6129 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 6130 V = V.getOperand(0); 6131 if (ISD::isNormalLoad(V.getNode())) 6132 return true; 6133 return false; 6134} 6135 6136/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by 6137/// a vector extract, and if both can be later optimized into a single load. 6138/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked 6139/// here because otherwise a target specific shuffle node is going to be 6140/// emitted for this shuffle, and the optimization not done. 6141/// FIXME: This is probably not the best approach, but fix the problem 6142/// until the right path is decided. 6143static 6144bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, 6145 const TargetLowering &TLI) { 6146 EVT VT = V.getValueType(); 6147 ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); 6148 6149 // Be sure that the vector shuffle is present in a pattern like this: 6150 // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) 6151 if (!V.hasOneUse()) 6152 return false; 6153 6154 SDNode *N = *V.getNode()->use_begin(); 6155 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 6156 return false; 6157 6158 SDValue EltNo = N->getOperand(1); 6159 if (!isa<ConstantSDNode>(EltNo)) 6160 return false; 6161 6162 // If the bit convert changed the number of elements, it is unsafe 6163 // to examine the mask. 
6164 bool HasShuffleIntoBitcast = false;
6165 if (V.getOpcode() == ISD::BITCAST) {
6166 EVT SrcVT = V.getOperand(0).getValueType();
6167 if (SrcVT.getVectorNumElements() != VT.getVectorNumElements())
6168 return false;
6169 V = V.getOperand(0);
6170 HasShuffleIntoBitcast = true;
6171 }
6172
6173 // Select the input vector, guarding against an out-of-range vector extract.
6174 unsigned NumElems = VT.getVectorNumElements();
6175 unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
6176 int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt);
6177 V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1);
6178
6179 // Skip one more bit_convert if necessary
6180 if (V.getOpcode() == ISD::BITCAST)
6181 V = V.getOperand(0);
6182
6183 if (ISD::isNormalLoad(V.getNode())) {
6184 // Is the original load suitable?
6185 LoadSDNode *LN0 = cast<LoadSDNode>(V);
6186
6187 // FIXME: avoid the multi-use bug that is preventing lots of
6188 // foldings from being detected; this is still wrong of course, but it
6189 // gives the temporarily desired behavior, and if it happens that
6190 // the load really has more uses, during isel it will not fold, and
6191 // will generate poor code.
6192 if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse()
6193 return false;
6194
6195 if (!HasShuffleIntoBitcast)
6196 return true;
6197
6198 // If there's a bitcast before the shuffle, check if the load type and
6199 // alignment are valid.
6200 unsigned Align = LN0->getAlignment();
6201 unsigned NewAlign =
6202 TLI.getTargetData()->getABITypeAlignment(
6203 VT.getTypeForEVT(*DAG.getContext()));
6204
6205 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
6206 return false;
6207 }
6208
6209 return true;
6210 }
6211
6212 static
6213 SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
6214 EVT VT = Op.getValueType();
6215
6216 // Canonicalize to v2f64.
6217 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
6218 return DAG.getNode(ISD::BITCAST, dl, VT,
6219 getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
6220 V1, DAG));
6221 }
6222
6223 static
6224 SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
6225 bool HasSSE2) {
6226 SDValue V1 = Op.getOperand(0);
6227 SDValue V2 = Op.getOperand(1);
6228 EVT VT = Op.getValueType();
6229
6230 assert(VT != MVT::v2i64 && "unsupported shuffle type");
6231
6232 if (HasSSE2 && VT == MVT::v2f64)
6233 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
6234
6235 // v4f32 or v4i32
6236 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG);
6237 }
6238
6239 static
6240 SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
6241 SDValue V1 = Op.getOperand(0);
6242 SDValue V2 = Op.getOperand(1);
6243 EVT VT = Op.getValueType();
6244
6245 assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
6246 "unsupported shuffle type");
6247
6248 if (V2.getOpcode() == ISD::UNDEF)
6249 V2 = V1;
6250
6251 // v4i32 or v4f32
6252 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
6253 }
6254
6255 static inline unsigned getSHUFPOpcode(EVT VT) {
6256 switch(VT.getSimpleVT().SimpleTy) {
6257 case MVT::v8i32: // Use fp unit for int unpack.
6258 case MVT::v8f32:
6259 case MVT::v4i32: // Use fp unit for int unpack.
6260 case MVT::v4f32: return X86ISD::SHUFPS;
6261 case MVT::v4i64: // Use fp unit for int unpack.
6262 case MVT::v4f64:
6263 case MVT::v2i64: // Use fp unit for int unpack.
6264 case MVT::v2f64: return X86ISD::SHUFPD;
6265 default:
6266 llvm_unreachable("Unknown type for shufp*");
6267 }
6268 return 0;
6269 }
6270
6271 static
6272 SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
6273 SDValue V1 = Op.getOperand(0);
6274 SDValue V2 = Op.getOperand(1);
6275 EVT VT = Op.getValueType();
6276 unsigned NumElems = VT.getVectorNumElements();
6277
6278 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
6279 // operand of these instructions is only memory, so check if there's a
6280 // potential load folding here, otherwise use SHUFPS or MOVSD to match the
6281 // same masks.
6282 bool CanFoldLoad = false;
6283
6284 // Trivial case, when V2 comes from a load.
6285 if (MayFoldVectorLoad(V2))
6286 CanFoldLoad = true;
6287
6288 // When V1 is a load, it can be folded later into a store in isel, example:
6289 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
6290 // turns into:
6291 // (MOVLPSmr addr:$src1, VR128:$src2)
6292 // So, recognize this potential and also use MOVLPS or MOVLPD
6293 if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
6294 CanFoldLoad = true;
6295
6296 // Both of them can't be memory operations though.
6297 if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2))
6298 CanFoldLoad = false;
6299
6300 if (CanFoldLoad) {
6301 if (HasSSE2 && NumElems == 2)
6302 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
6303
6304 if (NumElems == 4)
6305 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
6306 }
6307
6308 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6309 // movl and movlp will both match v2i64, but v2i64 is never matched by
6310 // movl earlier because we make it strict to avoid messing with the movlp load
6311 // folding logic (see the code above the getMOVLP call). Match it here instead;
6312 // this is horrible, but it will stay like this until we move all shuffle
6313 // matching to x86-specific nodes. Note that for the 1st condition all
6314 // types are matched with movsd.
6315 if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp))
6316 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
6317 else if (HasSSE2)
6318 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
6319
6320
6321 assert(VT != MVT::v4i32 && "unsupported shuffle type");
6322
6323 // Invert the operand order and use SHUFPS to match it.
6324 return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V2, V1,
6325 X86::getShuffleSHUFImmediate(SVOp), DAG);
6326 }
6327
6328 static inline unsigned getUNPCKLOpcode(EVT VT) {
6329 switch(VT.getSimpleVT().SimpleTy) {
6330 case MVT::v4i32: return X86ISD::PUNPCKLDQ;
6331 case MVT::v2i64: return X86ISD::PUNPCKLQDQ;
6332 case MVT::v4f32: return X86ISD::UNPCKLPS;
6333 case MVT::v2f64: return X86ISD::UNPCKLPD;
6334 case MVT::v8i32: // Use fp unit for int unpack.
6335 case MVT::v8f32: return X86ISD::VUNPCKLPSY;
6336 case MVT::v4i64: // Use fp unit for int unpack.
6337 case MVT::v4f64: return X86ISD::VUNPCKLPDY; 6338 case MVT::v16i8: return X86ISD::PUNPCKLBW; 6339 case MVT::v8i16: return X86ISD::PUNPCKLWD; 6340 default: 6341 llvm_unreachable("Unknown type for unpckl"); 6342 } 6343 return 0; 6344} 6345 6346static inline unsigned getUNPCKHOpcode(EVT VT) { 6347 switch(VT.getSimpleVT().SimpleTy) { 6348 case MVT::v4i32: return X86ISD::PUNPCKHDQ; 6349 case MVT::v2i64: return X86ISD::PUNPCKHQDQ; 6350 case MVT::v4f32: return X86ISD::UNPCKHPS; 6351 case MVT::v2f64: return X86ISD::UNPCKHPD; 6352 case MVT::v8i32: // Use fp unit for int unpack. 6353 case MVT::v8f32: return X86ISD::VUNPCKHPSY; 6354 case MVT::v4i64: // Use fp unit for int unpack. 6355 case MVT::v4f64: return X86ISD::VUNPCKHPDY; 6356 case MVT::v16i8: return X86ISD::PUNPCKHBW; 6357 case MVT::v8i16: return X86ISD::PUNPCKHWD; 6358 default: 6359 llvm_unreachable("Unknown type for unpckh"); 6360 } 6361 return 0; 6362} 6363 6364static inline unsigned getVPERMILOpcode(EVT VT) { 6365 switch(VT.getSimpleVT().SimpleTy) { 6366 case MVT::v4i32: 6367 case MVT::v4f32: return X86ISD::VPERMILPS; 6368 case MVT::v2i64: 6369 case MVT::v2f64: return X86ISD::VPERMILPD; 6370 case MVT::v8i32: 6371 case MVT::v8f32: return X86ISD::VPERMILPSY; 6372 case MVT::v4i64: 6373 case MVT::v4f64: return X86ISD::VPERMILPDY; 6374 default: 6375 llvm_unreachable("Unknown type for vpermil"); 6376 } 6377 return 0; 6378} 6379 6380/// isVectorBroadcast - Check if the node chain is suitable to be xformed to 6381/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming 6382/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded. 6383static bool isVectorBroadcast(SDValue &Op) { 6384 EVT VT = Op.getValueType(); 6385 bool Is256 = VT.getSizeInBits() == 256; 6386 6387 assert((VT.getSizeInBits() == 128 || Is256) && 6388 "Unsupported type for vbroadcast node"); 6389 6390 SDValue V = Op; 6391 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 6392 V = V.getOperand(0); 6393 6394 if (Is256 && !(V.hasOneUse() && 6395 V.getOpcode() == ISD::INSERT_SUBVECTOR && 6396 V.getOperand(0).getOpcode() == ISD::UNDEF)) 6397 return false; 6398 6399 if (Is256) 6400 V = V.getOperand(1); 6401 if (V.hasOneUse() && V.getOpcode() != ISD::SCALAR_TO_VECTOR) 6402 return false; 6403 6404 // Check the source scalar_to_vector type. 256-bit broadcasts are 6405 // supported for 32/64-bit sizes, while 128-bit ones are only supported 6406 // for 32-bit scalars. 
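// In AVX terms this corresponds to vbroadcastss/vbroadcastsd for the 256-bit
// forms and to vbroadcastss only for the 128-bit form.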
6407 unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits();
6408 if (ScalarSize != 32 && ScalarSize != 64)
6409 return false;
6410 if (!Is256 && ScalarSize == 64)
6411 return false;
6412
6413 V = V.getOperand(0);
6414 if (!MayFoldLoad(V))
6415 return false;
6416
6417 // Return the load node
6418 Op = V;
6419 return true;
6420 }
6421
6422 static
6423 SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
6424 const TargetLowering &TLI,
6425 const X86Subtarget *Subtarget) {
6426 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6427 EVT VT = Op.getValueType();
6428 DebugLoc dl = Op.getDebugLoc();
6429 SDValue V1 = Op.getOperand(0);
6430 SDValue V2 = Op.getOperand(1);
6431
6432 if (isZeroShuffle(SVOp))
6433 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
6434
6435 // Handle splat operations
6436 if (SVOp->isSplat()) {
6437 unsigned NumElem = VT.getVectorNumElements();
6438 int Size = VT.getSizeInBits();
6439 // Special case, this is the only place now where it's allowed to return
6440 // a vector_shuffle operation without using a target specific node, because
6441 // *hopefully* it will be optimized away by the dag combiner. FIXME: should
6442 // this be moved to DAGCombine instead?
6443 if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
6444 return Op;
6445
6446 // Use vbroadcast whenever the splat comes from a foldable load
6447 if (Subtarget->hasAVX() && isVectorBroadcast(V1))
6448 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1);
6449
6450 // Handle splats by matching through known shuffle masks
6451 if ((Size == 128 && NumElem <= 4) ||
6452 (Size == 256 && NumElem < 8))
6453 return SDValue();
6454
6455 // All remaining splats are promoted to target supported vector shuffles.
6456 return PromoteSplat(SVOp, DAG);
6457 }
6458
6459 // If the shuffle can be profitably rewritten as a narrower shuffle, then
6460 // do it!
6461 if (VT == MVT::v8i16 || VT == MVT::v16i8) {
6462 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6463 if (NewOp.getNode())
6464 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
6465 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
6466 // FIXME: Figure out a cleaner way to do this.
6467 // Try to make use of movq to zero out the top part.
6468 if (ISD::isBuildVectorAllZeros(V2.getNode())) {
6469 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6470 if (NewOp.getNode()) {
6471 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
6472 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
6473 DAG, Subtarget, dl);
6474 }
6475 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
6476 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6477 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
6478 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
6479 DAG, Subtarget, dl);
6480 }
6481 }
6482 return SDValue();
6483 }
6484
6485 SDValue
6486 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
6487 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6488 SDValue V1 = Op.getOperand(0);
6489 SDValue V2 = Op.getOperand(1);
6490 EVT VT = Op.getValueType();
6491 DebugLoc dl = Op.getDebugLoc();
6492 unsigned NumElems = VT.getVectorNumElements();
6493 bool isMMX = VT.getSizeInBits() == 64;
6494 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
6495 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
6496 bool V1IsSplat = false;
6497 bool V2IsSplat = false;
6498 bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX();
6499 bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX();
6500 bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX();
6501 MachineFunction &MF = DAG.getMachineFunction();
6502 bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
6503
6504 // Shuffle operations on MMX are not supported.
6505 if (isMMX)
6506 return Op;
6507
6508 // Vector shuffle lowering takes 3 steps:
6509 //
6510 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
6511 // narrowing and commutation of operands should be handled.
6512 // 2) Matching of shuffles with known shuffle masks to x86 target specific
6513 // shuffle nodes.
6514 // 3) Rewriting of unmatched masks into new generic shuffle operations,
6515 // so the shuffle can be broken into other shuffles and the legalizer can
6516 // try the lowering again.
6517 //
6518 // The general idea is that no vector_shuffle operation should be left to
6519 // be matched during isel; all of them must be converted to a target specific
6520 // node here.
6521
6522 // Normalize the input vectors. Here splats, zeroed vectors, profitable
6523 // narrowing and commutation of operands should be handled. The actual code
6524 // doesn't include all of those, work in progress...
6525 SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget);
6526 if (NewOp.getNode())
6527 return NewOp;
6528
6529 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
6530 // unpckh_undef). Only use pshufd if speed is more important than size.
6531 if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) 6532 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 6533 if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) 6534 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 6535 6536 if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef && 6537 RelaxedMayFoldVectorLoad(V1)) 6538 return getMOVDDup(Op, dl, V1, DAG); 6539 6540 if (X86::isMOVHLPS_v_undef_Mask(SVOp)) 6541 return getMOVHighToLow(Op, dl, DAG); 6542 6543 // Use to match splats 6544 if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef && 6545 (VT == MVT::v2f64 || VT == MVT::v2i64)) 6546 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 6547 6548 if (X86::isPSHUFDMask(SVOp)) { 6549 // The actual implementation will match the mask in the if above and then 6550 // during isel it can match several different instructions, not only pshufd 6551 // as its name says, sad but true, emulate the behavior for now... 6552 if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 6553 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 6554 6555 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 6556 6557 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 6558 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 6559 6560 return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V1, 6561 TargetMask, DAG); 6562 } 6563 6564 // Check if this can be converted into a logical shift. 6565 bool isLeft = false; 6566 unsigned ShAmt = 0; 6567 SDValue ShVal; 6568 bool isShift = getSubtarget()->hasSSE2() && 6569 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 6570 if (isShift && ShVal.hasOneUse()) { 6571 // If the shifted value has multiple uses, it may be cheaper to use 6572 // v_set0 + movlhps or movhlps, etc. 6573 EVT EltVT = VT.getVectorElementType(); 6574 ShAmt *= EltVT.getSizeInBits(); 6575 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 6576 } 6577 6578 if (X86::isMOVLMask(SVOp)) { 6579 if (V1IsUndef) 6580 return V2; 6581 if (ISD::isBuildVectorAllZeros(V1.getNode())) 6582 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 6583 if (!X86::isMOVLPMask(SVOp)) { 6584 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 6585 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 6586 6587 if (VT == MVT::v4i32 || VT == MVT::v4f32) 6588 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 6589 } 6590 } 6591 6592 // FIXME: fold these into legal mask. 6593 if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) 6594 return getMOVLowToHigh(Op, dl, DAG, HasSSE2); 6595 6596 if (X86::isMOVHLPSMask(SVOp)) 6597 return getMOVHighToLow(Op, dl, DAG); 6598 6599 if (X86::isMOVSHDUPMask(SVOp, Subtarget)) 6600 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 6601 6602 if (X86::isMOVSLDUPMask(SVOp, Subtarget)) 6603 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 6604 6605 if (X86::isMOVLPMask(SVOp)) 6606 return getMOVLP(Op, dl, DAG, HasSSE2); 6607 6608 if (ShouldXformToMOVHLPS(SVOp) || 6609 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 6610 return CommuteVectorShuffle(SVOp, DAG); 6611 6612 if (isShift) { 6613 // No better options. Use a vshl / vsrl. 6614 EVT EltVT = VT.getVectorElementType(); 6615 ShAmt *= EltVT.getSizeInBits(); 6616 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 6617 } 6618 6619 bool Commuted = false; 6620 // FIXME: This should also accept a bitcast of a splat? 
Be careful, not
6621 // 1,1,1,1 -> v8i16 though.
6622 V1IsSplat = isSplatVector(V1.getNode());
6623 V2IsSplat = isSplatVector(V2.getNode());
6624
6625 // Canonicalize the splat or undef, if present, to be on the RHS.
6626 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
6627 Op = CommuteVectorShuffle(SVOp, DAG);
6628 SVOp = cast<ShuffleVectorSDNode>(Op);
6629 V1 = SVOp->getOperand(0);
6630 V2 = SVOp->getOperand(1);
6631 std::swap(V1IsSplat, V2IsSplat);
6632 std::swap(V1IsUndef, V2IsUndef);
6633 Commuted = true;
6634 }
6635
6636 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
6637 // Shuffling low element of v1 into undef, just return v1.
6638 if (V2IsUndef)
6639 return V1;
6640 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
6641 // the instruction selector will not match, so get a canonical MOVL with
6642 // swapped operands to undo the commute.
6643 return getMOVL(DAG, dl, VT, V2, V1);
6644 }
6645
6646 if (X86::isUNPCKLMask(SVOp))
6647 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG);
6648
6649 if (X86::isUNPCKHMask(SVOp))
6650 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG);
6651
6652 if (V2IsSplat) {
6653 // Normalize mask so all entries that point to V2 point to its first
6654 // element, then try to match unpck{h|l} again. If a match is found, return
6655 // a new vector_shuffle with the corrected mask.
6656 SDValue NewMask = NormalizeMask(SVOp, DAG);
6657 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
6658 if (NSVOp != SVOp) {
6659 if (X86::isUNPCKLMask(NSVOp, true)) {
6660 return NewMask;
6661 } else if (X86::isUNPCKHMask(NSVOp, true)) {
6662 return NewMask;
6663 }
6664 }
6665 }
6666
6667 if (Commuted) {
6668 // Commute it back and try unpck* again.
6669 // FIXME: this seems wrong.
6670 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
6671 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
6672
6673 if (X86::isUNPCKLMask(NewSVOp))
6674 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG);
6675
6676 if (X86::isUNPCKHMask(NewSVOp))
6677 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG);
6678 }
6679
6680 // Normalize the node to match x86 shuffle ops if needed
6681 if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
6682 return CommuteVectorShuffle(SVOp, DAG);
6683
6684 // The checks below are all present in isShuffleMaskLegal, but they are
6685 // inlined here right now to enable us to directly emit target specific
6686 // nodes, and remove one by one until they don't return Op anymore.
6687 SmallVector<int, 16> M; 6688 SVOp->getMask(M); 6689 6690 if (isPALIGNRMask(M, VT, HasSSSE3)) 6691 return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, 6692 X86::getShufflePALIGNRImmediate(SVOp), 6693 DAG); 6694 6695 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 6696 SVOp->getSplatIndex() == 0 && V2IsUndef) { 6697 if (VT == MVT::v2f64) 6698 return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); 6699 if (VT == MVT::v2i64) 6700 return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); 6701 } 6702 6703 if (isPSHUFHWMask(M, VT)) 6704 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 6705 X86::getShufflePSHUFHWImmediate(SVOp), 6706 DAG); 6707 6708 if (isPSHUFLWMask(M, VT)) 6709 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 6710 X86::getShufflePSHUFLWImmediate(SVOp), 6711 DAG); 6712 6713 if (isSHUFPMask(M, VT)) 6714 return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, 6715 X86::getShuffleSHUFImmediate(SVOp), DAG); 6716 6717 if (X86::isUNPCKL_v_undef_Mask(SVOp)) 6718 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 6719 if (X86::isUNPCKH_v_undef_Mask(SVOp)) 6720 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 6721 6722 //===--------------------------------------------------------------------===// 6723 // Generate target specific nodes for 128 or 256-bit shuffles only 6724 // supported in the AVX instruction set. 6725 // 6726 6727 // Handle VMOVDDUPY permutations 6728 if (isMOVDDUPYMask(SVOp, Subtarget)) 6729 return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); 6730 6731 // Handle VPERMILPS* permutations 6732 if (isVPERMILPSMask(M, VT, Subtarget)) 6733 return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, 6734 getShuffleVPERMILPSImmediate(SVOp), DAG); 6735 6736 // Handle VPERMILPD* permutations 6737 if (isVPERMILPDMask(M, VT, Subtarget)) 6738 return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, 6739 getShuffleVPERMILPDImmediate(SVOp), DAG); 6740 6741 // Handle VPERM2F128 permutations 6742 if (isVPERM2F128Mask(M, VT, Subtarget)) 6743 return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2, 6744 getShuffleVPERM2F128Immediate(SVOp), DAG); 6745 6746 // Handle VSHUFPSY permutations 6747 if (isVSHUFPSYMask(M, VT, Subtarget)) 6748 return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, 6749 getShuffleVSHUFPSYImmediate(SVOp), DAG); 6750 6751 // Handle VSHUFPDY permutations 6752 if (isVSHUFPDYMask(M, VT, Subtarget)) 6753 return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, 6754 getShuffleVSHUFPDYImmediate(SVOp), DAG); 6755 6756 //===--------------------------------------------------------------------===// 6757 // Since no target specific shuffle was selected for this generic one, 6758 // lower it into other known shuffles. FIXME: this isn't true yet, but 6759 // this is the plan. 6760 // 6761 6762 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 6763 if (VT == MVT::v8i16) { 6764 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 6765 if (NewOp.getNode()) 6766 return NewOp; 6767 } 6768 6769 if (VT == MVT::v16i8) { 6770 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 6771 if (NewOp.getNode()) 6772 return NewOp; 6773 } 6774 6775 // Handle all 128-bit wide vectors with 4 elements, and match them with 6776 // several different shuffle types. 
6777 if (NumElems == 4 && VT.getSizeInBits() == 128) 6778 return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); 6779 6780 // Handle general 256-bit shuffles 6781 if (VT.is256BitVector()) 6782 return LowerVECTOR_SHUFFLE_256(SVOp, DAG); 6783 6784 return SDValue(); 6785} 6786 6787SDValue 6788X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 6789 SelectionDAG &DAG) const { 6790 EVT VT = Op.getValueType(); 6791 DebugLoc dl = Op.getDebugLoc(); 6792 6793 if (Op.getOperand(0).getValueType().getSizeInBits() != 128) 6794 return SDValue(); 6795 6796 if (VT.getSizeInBits() == 8) { 6797 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 6798 Op.getOperand(0), Op.getOperand(1)); 6799 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6800 DAG.getValueType(VT)); 6801 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6802 } else if (VT.getSizeInBits() == 16) { 6803 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6804 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 6805 if (Idx == 0) 6806 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6807 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6808 DAG.getNode(ISD::BITCAST, dl, 6809 MVT::v4i32, 6810 Op.getOperand(0)), 6811 Op.getOperand(1))); 6812 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 6813 Op.getOperand(0), Op.getOperand(1)); 6814 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6815 DAG.getValueType(VT)); 6816 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6817 } else if (VT == MVT::f32) { 6818 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 6819 // the result back to FR32 register. It's only worth matching if the 6820 // result has a single use which is a store or a bitcast to i32. And in 6821 // the case of a store, it's not worth it if the index is a constant 0, 6822 // because a MOVSSmr can be used instead, which is smaller and faster. 6823 if (!Op.hasOneUse()) 6824 return SDValue(); 6825 SDNode *User = *Op.getNode()->use_begin(); 6826 if ((User->getOpcode() != ISD::STORE || 6827 (isa<ConstantSDNode>(Op.getOperand(1)) && 6828 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 6829 (User->getOpcode() != ISD::BITCAST || 6830 User->getValueType(0) != MVT::i32)) 6831 return SDValue(); 6832 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6833 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 6834 Op.getOperand(0)), 6835 Op.getOperand(1)); 6836 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 6837 } else if (VT == MVT::i32) { 6838 // ExtractPS works with constant index. 6839 if (isa<ConstantSDNode>(Op.getOperand(1))) 6840 return Op; 6841 } 6842 return SDValue(); 6843} 6844 6845 6846SDValue 6847X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 6848 SelectionDAG &DAG) const { 6849 if (!isa<ConstantSDNode>(Op.getOperand(1))) 6850 return SDValue(); 6851 6852 SDValue Vec = Op.getOperand(0); 6853 EVT VecVT = Vec.getValueType(); 6854 6855 // If this is a 256-bit vector result, first extract the 128-bit vector and 6856 // then extract the element from the 128-bit vector. 6857 if (VecVT.getSizeInBits() == 256) { 6858 DebugLoc dl = Op.getNode()->getDebugLoc(); 6859 unsigned NumElems = VecVT.getVectorNumElements(); 6860 SDValue Idx = Op.getOperand(1); 6861 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 6862 6863 // Get the 128-bit vector. 6864 bool Upper = IdxVal >= NumElems/2; 6865 Vec = Extract128BitVector(Vec, 6866 DAG.getConstant(Upper ? 
NumElems/2 : 0, MVT::i32), DAG, dl); 6867 6868 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, 6869 Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx); 6870 } 6871 6872 assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); 6873 6874 if (Subtarget->hasSSE41() || Subtarget->hasAVX()) { 6875 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 6876 if (Res.getNode()) 6877 return Res; 6878 } 6879 6880 EVT VT = Op.getValueType(); 6881 DebugLoc dl = Op.getDebugLoc(); 6882 // TODO: handle v16i8. 6883 if (VT.getSizeInBits() == 16) { 6884 SDValue Vec = Op.getOperand(0); 6885 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6886 if (Idx == 0) 6887 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6888 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6889 DAG.getNode(ISD::BITCAST, dl, 6890 MVT::v4i32, Vec), 6891 Op.getOperand(1))); 6892 // Transform it so it match pextrw which produces a 32-bit result. 6893 EVT EltVT = MVT::i32; 6894 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 6895 Op.getOperand(0), Op.getOperand(1)); 6896 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 6897 DAG.getValueType(VT)); 6898 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6899 } else if (VT.getSizeInBits() == 32) { 6900 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6901 if (Idx == 0) 6902 return Op; 6903 6904 // SHUFPS the element to the lowest double word, then movss. 6905 int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; 6906 EVT VVT = Op.getOperand(0).getValueType(); 6907 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6908 DAG.getUNDEF(VVT), Mask); 6909 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6910 DAG.getIntPtrConstant(0)); 6911 } else if (VT.getSizeInBits() == 64) { 6912 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 6913 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 6914 // to match extract_elt for f64. 6915 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6916 if (Idx == 0) 6917 return Op; 6918 6919 // UNPCKHPD the element to the lowest double word, then movsd. 6920 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 6921 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 6922 int Mask[2] = { 1, -1 }; 6923 EVT VVT = Op.getOperand(0).getValueType(); 6924 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6925 DAG.getUNDEF(VVT), Mask); 6926 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6927 DAG.getIntPtrConstant(0)); 6928 } 6929 6930 return SDValue(); 6931} 6932 6933SDValue 6934X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 6935 SelectionDAG &DAG) const { 6936 EVT VT = Op.getValueType(); 6937 EVT EltVT = VT.getVectorElementType(); 6938 DebugLoc dl = Op.getDebugLoc(); 6939 6940 SDValue N0 = Op.getOperand(0); 6941 SDValue N1 = Op.getOperand(1); 6942 SDValue N2 = Op.getOperand(2); 6943 6944 if (VT.getSizeInBits() == 256) 6945 return SDValue(); 6946 6947 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 6948 isa<ConstantSDNode>(N2)) { 6949 unsigned Opc; 6950 if (VT == MVT::v8i16) 6951 Opc = X86ISD::PINSRW; 6952 else if (VT == MVT::v16i8) 6953 Opc = X86ISD::PINSRB; 6954 else 6955 Opc = X86ISD::PINSRB; 6956 6957 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 6958 // argument. 
6959 if (N1.getValueType() != MVT::i32) 6960 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 6961 if (N2.getValueType() != MVT::i32) 6962 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 6963 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 6964 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 6965 // Bits [7:6] of the constant are the source select. This will always be 6966 // zero here. The DAG Combiner may combine an extract_elt index into these 6967 // bits. For example (insert (extract, 3), 2) could be matched by putting 6968 // the '3' into bits [7:6] of X86ISD::INSERTPS. 6969 // Bits [5:4] of the constant are the destination select. This is the 6970 // value of the incoming immediate. 6971 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 6972 // combine either bitwise AND or insert of float 0.0 to set these bits. 6973 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 6974 // Create this as a scalar to vector.. 6975 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 6976 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 6977 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 6978 // PINSR* works with constant index. 6979 return Op; 6980 } 6981 return SDValue(); 6982} 6983 6984SDValue 6985X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 6986 EVT VT = Op.getValueType(); 6987 EVT EltVT = VT.getVectorElementType(); 6988 6989 DebugLoc dl = Op.getDebugLoc(); 6990 SDValue N0 = Op.getOperand(0); 6991 SDValue N1 = Op.getOperand(1); 6992 SDValue N2 = Op.getOperand(2); 6993 6994 // If this is a 256-bit vector result, first extract the 128-bit vector, 6995 // insert the element into the extracted half and then place it back. 6996 if (VT.getSizeInBits() == 256) { 6997 if (!isa<ConstantSDNode>(N2)) 6998 return SDValue(); 6999 7000 // Get the desired 128-bit vector half. 7001 unsigned NumElems = VT.getVectorNumElements(); 7002 unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); 7003 bool Upper = IdxVal >= NumElems/2; 7004 SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32); 7005 SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl); 7006 7007 // Insert the element into the desired half. 7008 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, 7009 N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2); 7010 7011 // Insert the changed part back to the 256-bit vector 7012 return Insert128BitVector(N0, V, Ins128Idx, DAG, dl); 7013 } 7014 7015 if (Subtarget->hasSSE41() || Subtarget->hasAVX()) 7016 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 7017 7018 if (EltVT == MVT::i8) 7019 return SDValue(); 7020 7021 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 7022 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 7023 // as its second argument. 
7024 if (N1.getValueType() != MVT::i32)
7025 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
7026 if (N2.getValueType() != MVT::i32)
7027 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
7028 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
7029 }
7030 return SDValue();
7031 }
7032
7033 SDValue
7034 X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7035 LLVMContext *Context = DAG.getContext();
7036 DebugLoc dl = Op.getDebugLoc();
7037 EVT OpVT = Op.getValueType();
7038
7039 // If this is a 256-bit vector result, first insert into a 128-bit
7040 // vector and then insert into the 256-bit vector.
7041 if (OpVT.getSizeInBits() > 128) {
7042 // Insert into a 128-bit vector.
7043 EVT VT128 = EVT::getVectorVT(*Context,
7044 OpVT.getVectorElementType(),
7045 OpVT.getVectorNumElements() / 2);
7046
7047 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
7048
7049 // Insert the 128-bit vector.
7050 return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op,
7051 DAG.getConstant(0, MVT::i32),
7052 DAG, dl);
7053 }
7054
7055 if (Op.getValueType() == MVT::v1i64 &&
7056 Op.getOperand(0).getValueType() == MVT::i64)
7057 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
7058
7059 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
7060 assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 &&
7061 "Expected an SSE type!");
7062 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(),
7063 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
7064 }
7065
7066 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
7067 // a simple subregister reference or explicit instructions to grab
7068 // upper bits of a vector.
7069 SDValue
7070 X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
7071 if (Subtarget->hasAVX()) {
7072 DebugLoc dl = Op.getNode()->getDebugLoc();
7073 SDValue Vec = Op.getNode()->getOperand(0);
7074 SDValue Idx = Op.getNode()->getOperand(1);
7075
7076 if (Op.getNode()->getValueType(0).getSizeInBits() == 128
7077 && Vec.getNode()->getValueType(0).getSizeInBits() == 256) {
7078 return Extract128BitVector(Vec, Idx, DAG, dl);
7079 }
7080 }
7081 return SDValue();
7082 }
7083
7084 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
7085 // simple superregister reference or explicit instructions to insert
7086 // the upper bits of a vector.
7087 SDValue
7088 X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
7089 if (Subtarget->hasAVX()) {
7090 DebugLoc dl = Op.getNode()->getDebugLoc();
7091 SDValue Vec = Op.getNode()->getOperand(0);
7092 SDValue SubVec = Op.getNode()->getOperand(1);
7093 SDValue Idx = Op.getNode()->getOperand(2);
7094
7095 if (Op.getNode()->getValueType(0).getSizeInBits() == 256
7096 && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) {
7097 return Insert128BitVector(Vec, SubVec, Idx, DAG, dl);
7098 }
7099 }
7100 return SDValue();
7101 }
7102
7103 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
7104 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
7105 // one of the above mentioned nodes. It has to be wrapped because otherwise
7106 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
7107 // be used to form an addressing mode. These wrapped nodes will be selected
7108 // into MOV32ri.
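// For example, a constant pool reference is lowered to
//   (X86ISD::Wrapper (TargetConstantPool ...))
// and, in 32-bit PIC mode, materialized as
//   (add (X86ISD::GlobalBaseReg), (X86ISD::Wrapper (TargetConstantPool ...))).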
7109SDValue 7110X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 7111 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 7112 7113 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7114 // global base reg. 7115 unsigned char OpFlag = 0; 7116 unsigned WrapperKind = X86ISD::Wrapper; 7117 CodeModel::Model M = getTargetMachine().getCodeModel(); 7118 7119 if (Subtarget->isPICStyleRIPRel() && 7120 (M == CodeModel::Small || M == CodeModel::Kernel)) 7121 WrapperKind = X86ISD::WrapperRIP; 7122 else if (Subtarget->isPICStyleGOT()) 7123 OpFlag = X86II::MO_GOTOFF; 7124 else if (Subtarget->isPICStyleStubPIC()) 7125 OpFlag = X86II::MO_PIC_BASE_OFFSET; 7126 7127 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 7128 CP->getAlignment(), 7129 CP->getOffset(), OpFlag); 7130 DebugLoc DL = CP->getDebugLoc(); 7131 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7132 // With PIC, the address is actually $g + Offset. 7133 if (OpFlag) { 7134 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7135 DAG.getNode(X86ISD::GlobalBaseReg, 7136 DebugLoc(), getPointerTy()), 7137 Result); 7138 } 7139 7140 return Result; 7141} 7142 7143SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 7144 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 7145 7146 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7147 // global base reg. 7148 unsigned char OpFlag = 0; 7149 unsigned WrapperKind = X86ISD::Wrapper; 7150 CodeModel::Model M = getTargetMachine().getCodeModel(); 7151 7152 if (Subtarget->isPICStyleRIPRel() && 7153 (M == CodeModel::Small || M == CodeModel::Kernel)) 7154 WrapperKind = X86ISD::WrapperRIP; 7155 else if (Subtarget->isPICStyleGOT()) 7156 OpFlag = X86II::MO_GOTOFF; 7157 else if (Subtarget->isPICStyleStubPIC()) 7158 OpFlag = X86II::MO_PIC_BASE_OFFSET; 7159 7160 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 7161 OpFlag); 7162 DebugLoc DL = JT->getDebugLoc(); 7163 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7164 7165 // With PIC, the address is actually $g + Offset. 7166 if (OpFlag) 7167 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7168 DAG.getNode(X86ISD::GlobalBaseReg, 7169 DebugLoc(), getPointerTy()), 7170 Result); 7171 7172 return Result; 7173} 7174 7175SDValue 7176X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 7177 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 7178 7179 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7180 // global base reg. 7181 unsigned char OpFlag = 0; 7182 unsigned WrapperKind = X86ISD::Wrapper; 7183 CodeModel::Model M = getTargetMachine().getCodeModel(); 7184 7185 if (Subtarget->isPICStyleRIPRel() && 7186 (M == CodeModel::Small || M == CodeModel::Kernel)) { 7187 if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF()) 7188 OpFlag = X86II::MO_GOTPCREL; 7189 WrapperKind = X86ISD::WrapperRIP; 7190 } else if (Subtarget->isPICStyleGOT()) { 7191 OpFlag = X86II::MO_GOT; 7192 } else if (Subtarget->isPICStyleStubPIC()) { 7193 OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE; 7194 } else if (Subtarget->isPICStyleStubNoDynamic()) { 7195 OpFlag = X86II::MO_DARWIN_NONLAZY; 7196 } 7197 7198 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 7199 7200 DebugLoc DL = Op.getDebugLoc(); 7201 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7202 7203 7204 // With PIC, the address is actually $g + Offset. 
7205 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 7206 !Subtarget->is64Bit()) { 7207 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7208 DAG.getNode(X86ISD::GlobalBaseReg, 7209 DebugLoc(), getPointerTy()), 7210 Result); 7211 } 7212 7213 // For symbols that require a load from a stub to get the address, emit the 7214 // load. 7215 if (isGlobalStubReference(OpFlag)) 7216 Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, 7217 MachinePointerInfo::getGOT(), false, false, 0); 7218 7219 return Result; 7220} 7221 7222SDValue 7223X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 7224 // Create the TargetBlockAddressAddress node. 7225 unsigned char OpFlags = 7226 Subtarget->ClassifyBlockAddressReference(); 7227 CodeModel::Model M = getTargetMachine().getCodeModel(); 7228 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 7229 DebugLoc dl = Op.getDebugLoc(); 7230 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 7231 /*isTarget=*/true, OpFlags); 7232 7233 if (Subtarget->isPICStyleRIPRel() && 7234 (M == CodeModel::Small || M == CodeModel::Kernel)) 7235 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 7236 else 7237 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 7238 7239 // With PIC, the address is actually $g + Offset. 7240 if (isGlobalRelativeToPICBase(OpFlags)) { 7241 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 7242 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 7243 Result); 7244 } 7245 7246 return Result; 7247} 7248 7249SDValue 7250X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 7251 int64_t Offset, 7252 SelectionDAG &DAG) const { 7253 // Create the TargetGlobalAddress node, folding in the constant 7254 // offset if it is legal. 7255 unsigned char OpFlags = 7256 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 7257 CodeModel::Model M = getTargetMachine().getCodeModel(); 7258 SDValue Result; 7259 if (OpFlags == X86II::MO_NO_FLAG && 7260 X86::isOffsetSuitableForCodeModel(Offset, M)) { 7261 // A direct static reference to a global. 7262 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 7263 Offset = 0; 7264 } else { 7265 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 7266 } 7267 7268 if (Subtarget->isPICStyleRIPRel() && 7269 (M == CodeModel::Small || M == CodeModel::Kernel)) 7270 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 7271 else 7272 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 7273 7274 // With PIC, the address is actually $g + Offset. 7275 if (isGlobalRelativeToPICBase(OpFlags)) { 7276 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 7277 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 7278 Result); 7279 } 7280 7281 // For globals that require a load from a stub to get the address, emit the 7282 // load. 7283 if (isGlobalStubReference(OpFlags)) 7284 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 7285 MachinePointerInfo::getGOT(), false, false, 0); 7286 7287 // If there was a non-zero offset that we didn't fold, create an explicit 7288 // addition for it. 
7289 if (Offset != 0) 7290 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 7291 DAG.getConstant(Offset, getPointerTy())); 7292 7293 return Result; 7294} 7295 7296SDValue 7297X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 7298 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 7299 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 7300 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 7301} 7302 7303static SDValue 7304GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 7305 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 7306 unsigned char OperandFlags) { 7307 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7308 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 7309 DebugLoc dl = GA->getDebugLoc(); 7310 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7311 GA->getValueType(0), 7312 GA->getOffset(), 7313 OperandFlags); 7314 if (InFlag) { 7315 SDValue Ops[] = { Chain, TGA, *InFlag }; 7316 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 7317 } else { 7318 SDValue Ops[] = { Chain, TGA }; 7319 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 7320 } 7321 7322 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 7323 MFI->setAdjustsStack(true); 7324 7325 SDValue Flag = Chain.getValue(1); 7326 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 7327} 7328 7329// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 7330static SDValue 7331LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7332 const EVT PtrVT) { 7333 SDValue InFlag; 7334 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 7335 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 7336 DAG.getNode(X86ISD::GlobalBaseReg, 7337 DebugLoc(), PtrVT), InFlag); 7338 InFlag = Chain.getValue(1); 7339 7340 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 7341} 7342 7343// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 7344static SDValue 7345LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7346 const EVT PtrVT) { 7347 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 7348 X86::RAX, X86II::MO_TLSGD); 7349} 7350 7351// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 7352// "local exec" model. 7353static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7354 const EVT PtrVT, TLSModel::Model model, 7355 bool is64Bit) { 7356 DebugLoc dl = GA->getDebugLoc(); 7357 7358 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 7359 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 7360 is64Bit ? 257 : 256)); 7361 7362 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 7363 DAG.getIntPtrConstant(0), 7364 MachinePointerInfo(Ptr), false, false, 0); 7365 7366 unsigned char OperandFlags = 0; 7367 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 7368 // initialexec. 7369 unsigned WrapperKind = X86ISD::Wrapper; 7370 if (model == TLSModel::LocalExec) { 7371 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 7372 } else if (is64Bit) { 7373 assert(model == TLSModel::InitialExec); 7374 OperandFlags = X86II::MO_GOTTPOFF; 7375 WrapperKind = X86ISD::WrapperRIP; 7376 } else { 7377 assert(model == TLSModel::InitialExec); 7378 OperandFlags = X86II::MO_INDNTPOFF; 7379 } 7380 7381 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 7382 // exec) 7383 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7384 GA->getValueType(0), 7385 GA->getOffset(), OperandFlags); 7386 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 7387 7388 if (model == TLSModel::InitialExec) 7389 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 7390 MachinePointerInfo::getGOT(), false, false, 0); 7391 7392 // The address of the thread local variable is the add of the thread 7393 // pointer with the offset of the variable. 7394 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 7395} 7396 7397SDValue 7398X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 7399 7400 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 7401 const GlobalValue *GV = GA->getGlobal(); 7402 7403 if (Subtarget->isTargetELF()) { 7404 // TODO: implement the "local dynamic" model 7405 // TODO: implement the "initial exec"model for pic executables 7406 7407 // If GV is an alias then use the aliasee for determining 7408 // thread-localness. 7409 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 7410 GV = GA->resolveAliasedGlobal(false); 7411 7412 TLSModel::Model model 7413 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 7414 7415 switch (model) { 7416 case TLSModel::GeneralDynamic: 7417 case TLSModel::LocalDynamic: // not implemented 7418 if (Subtarget->is64Bit()) 7419 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 7420 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 7421 7422 case TLSModel::InitialExec: 7423 case TLSModel::LocalExec: 7424 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 7425 Subtarget->is64Bit()); 7426 } 7427 } else if (Subtarget->isTargetDarwin()) { 7428 // Darwin only has one model of TLS. Lower to that. 7429 unsigned char OpFlag = 0; 7430 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 7431 X86ISD::WrapperRIP : X86ISD::Wrapper; 7432 7433 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7434 // global base reg. 7435 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 7436 !Subtarget->is64Bit(); 7437 if (PIC32) 7438 OpFlag = X86II::MO_TLVP_PIC_BASE; 7439 else 7440 OpFlag = X86II::MO_TLVP; 7441 DebugLoc DL = Op.getDebugLoc(); 7442 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 7443 GA->getValueType(0), 7444 GA->getOffset(), OpFlag); 7445 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7446 7447 // With PIC32, the address is actually $g + Offset. 7448 if (PIC32) 7449 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7450 DAG.getNode(X86ISD::GlobalBaseReg, 7451 DebugLoc(), getPointerTy()), 7452 Offset); 7453 7454 // Lowering the machine isd will make sure everything is in the right 7455 // location. 7456 SDValue Chain = DAG.getEntryNode(); 7457 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 7458 SDValue Args[] = { Chain, Offset }; 7459 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); 7460 7461 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 
7462 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7463 MFI->setAdjustsStack(true); 7464 7465 // And our return value (tls address) is in the standard call return value 7466 // location. 7467 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 7468 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); 7469 } 7470 7471 assert(false && 7472 "TLS not implemented for this target."); 7473 7474 llvm_unreachable("Unreachable"); 7475 return SDValue(); 7476} 7477 7478 7479/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and 7480/// take a 2 x i32 value to shift plus a shift amount. 7481SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const { 7482 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 7483 EVT VT = Op.getValueType(); 7484 unsigned VTBits = VT.getSizeInBits(); 7485 DebugLoc dl = Op.getDebugLoc(); 7486 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 7487 SDValue ShOpLo = Op.getOperand(0); 7488 SDValue ShOpHi = Op.getOperand(1); 7489 SDValue ShAmt = Op.getOperand(2); 7490 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 7491 DAG.getConstant(VTBits - 1, MVT::i8)) 7492 : DAG.getConstant(0, VT); 7493 7494 SDValue Tmp2, Tmp3; 7495 if (Op.getOpcode() == ISD::SHL_PARTS) { 7496 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 7497 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 7498 } else { 7499 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 7500 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 7501 } 7502 7503 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 7504 DAG.getConstant(VTBits, MVT::i8)); 7505 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 7506 AndNode, DAG.getConstant(0, MVT::i8)); 7507 7508 SDValue Hi, Lo; 7509 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7510 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 7511 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 7512 7513 if (Op.getOpcode() == ISD::SHL_PARTS) { 7514 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 7515 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 7516 } else { 7517 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 7518 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 7519 } 7520 7521 SDValue Ops[2] = { Lo, Hi }; 7522 return DAG.getMergeValues(Ops, 2, dl); 7523} 7524 7525SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 7526 SelectionDAG &DAG) const { 7527 EVT SrcVT = Op.getOperand(0).getValueType(); 7528 7529 if (SrcVT.isVector()) 7530 return SDValue(); 7531 7532 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 7533 "Unknown SINT_TO_FP to lower!"); 7534 7535 // These are really Legal; return the operand so the caller accepts it as 7536 // Legal. 
7537 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 7538 return Op; 7539 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 7540 Subtarget->is64Bit()) { 7541 return Op; 7542 } 7543 7544 DebugLoc dl = Op.getDebugLoc(); 7545 unsigned Size = SrcVT.getSizeInBits()/8; 7546 MachineFunction &MF = DAG.getMachineFunction(); 7547 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 7548 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7549 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7550 StackSlot, 7551 MachinePointerInfo::getFixedStack(SSFI), 7552 false, false, 0); 7553 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 7554} 7555 7556SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 7557 SDValue StackSlot, 7558 SelectionDAG &DAG) const { 7559 // Build the FILD 7560 DebugLoc DL = Op.getDebugLoc(); 7561 SDVTList Tys; 7562 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 7563 if (useSSE) 7564 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 7565 else 7566 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 7567 7568 unsigned ByteSize = SrcVT.getSizeInBits()/8; 7569 7570 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); 7571 MachineMemOperand *MMO; 7572 if (FI) { 7573 int SSFI = FI->getIndex(); 7574 MMO = 7575 DAG.getMachineFunction() 7576 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7577 MachineMemOperand::MOLoad, ByteSize, ByteSize); 7578 } else { 7579 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); 7580 StackSlot = StackSlot.getOperand(1); 7581 } 7582 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 7583 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 7584 X86ISD::FILD, DL, 7585 Tys, Ops, array_lengthof(Ops), 7586 SrcVT, MMO); 7587 7588 if (useSSE) { 7589 Chain = Result.getValue(1); 7590 SDValue InFlag = Result.getValue(2); 7591 7592 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 7593 // shouldn't be necessary except that RFP cannot be live across 7594 // multiple blocks. When stackifier is fixed, they can be uncoupled. 7595 MachineFunction &MF = DAG.getMachineFunction(); 7596 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 7597 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 7598 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7599 Tys = DAG.getVTList(MVT::Other); 7600 SDValue Ops[] = { 7601 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 7602 }; 7603 MachineMemOperand *MMO = 7604 DAG.getMachineFunction() 7605 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7606 MachineMemOperand::MOStore, SSFISize, SSFISize); 7607 7608 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 7609 Ops, array_lengthof(Ops), 7610 Op.getValueType(), MMO); 7611 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 7612 MachinePointerInfo::getFixedStack(SSFI), 7613 false, false, 0); 7614 } 7615 7616 return Result; 7617} 7618 7619// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 7620SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 7621 SelectionDAG &DAG) const { 7622 // This algorithm is not obvious. Here it is in C code, more or less: 7623 /* 7624 double uint64_to_double( uint32_t hi, uint32_t lo ) { 7625 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 7626 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 7627 7628 // Copy ints to xmm registers. 
7629 __m128i xh = _mm_cvtsi32_si128( hi ); 7630 __m128i xl = _mm_cvtsi32_si128( lo ); 7631 7632 // Combine into low half of a single xmm register. 7633 __m128i x = _mm_unpacklo_epi32( xh, xl ); 7634 __m128d d; 7635 double sd; 7636 7637 // Merge in appropriate exponents to give the integer bits the right 7638 // magnitude. 7639 x = _mm_unpacklo_epi32( x, exp ); 7640 7641 // Subtract away the biases to deal with the IEEE-754 double precision 7642 // implicit 1. 7643 d = _mm_sub_pd( (__m128d) x, bias ); 7644 7645 // All conversions up to here are exact. The correctly rounded result is 7646 // calculated using the current rounding mode using the following 7647 // horizontal add. 7648 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 7649 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 7650 // store doesn't really need to be here (except 7651 // maybe to zero the other double) 7652 return sd; 7653 } 7654 */ 7655 7656 DebugLoc dl = Op.getDebugLoc(); 7657 LLVMContext *Context = DAG.getContext(); 7658 7659 // Build some magic constants. 7660 std::vector<Constant*> CV0; 7661 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 7662 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 7663 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 7664 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 7665 Constant *C0 = ConstantVector::get(CV0); 7666 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 7667 7668 std::vector<Constant*> CV1; 7669 CV1.push_back( 7670 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 7671 CV1.push_back( 7672 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 7673 Constant *C1 = ConstantVector::get(CV1); 7674 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 7675 7676 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7677 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7678 Op.getOperand(0), 7679 DAG.getIntPtrConstant(1))); 7680 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7681 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7682 Op.getOperand(0), 7683 DAG.getIntPtrConstant(0))); 7684 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 7685 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 7686 MachinePointerInfo::getConstantPool(), 7687 false, false, 16); 7688 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 7689 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); 7690 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 7691 MachinePointerInfo::getConstantPool(), 7692 false, false, 16); 7693 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 7694 7695 // Add the halves; easiest way is to swap them into another reg first. 7696 int ShufMask[2] = { 1, -1 }; 7697 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 7698 DAG.getUNDEF(MVT::v2f64), ShufMask); 7699 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 7700 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 7701 DAG.getIntPtrConstant(0)); 7702} 7703 7704// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 7705SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 7706 SelectionDAG &DAG) const { 7707 DebugLoc dl = Op.getDebugLoc(); 7708 // FP constant to bias correct the final result. 
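// 0x4330000000000000 is the IEEE-754 encoding of the double 2^52. OR-ing a
// 32-bit value into the low word of that constant yields the double 2^52 + x
// exactly, so subtracting the bias afterwards recovers x as an f64.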
7709 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 7710 MVT::f64); 7711 7712 // Load the 32-bit value into an XMM register. 7713 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7714 Op.getOperand(0)); 7715 7716 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 7717 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), 7718 DAG.getIntPtrConstant(0)); 7719 7720 // Or the load with the bias. 7721 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 7722 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7723 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 7724 MVT::v2f64, Load)), 7725 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7726 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 7727 MVT::v2f64, Bias))); 7728 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 7729 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), 7730 DAG.getIntPtrConstant(0)); 7731 7732 // Subtract the bias. 7733 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 7734 7735 // Handle final rounding. 7736 EVT DestVT = Op.getValueType(); 7737 7738 if (DestVT.bitsLT(MVT::f64)) { 7739 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 7740 DAG.getIntPtrConstant(0)); 7741 } else if (DestVT.bitsGT(MVT::f64)) { 7742 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 7743 } 7744 7745 // Handle final rounding. 7746 return Sub; 7747} 7748 7749SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 7750 SelectionDAG &DAG) const { 7751 SDValue N0 = Op.getOperand(0); 7752 DebugLoc dl = Op.getDebugLoc(); 7753 7754 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 7755 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 7756 // the optimization here. 7757 if (DAG.SignBitIsZero(N0)) 7758 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 7759 7760 EVT SrcVT = N0.getValueType(); 7761 EVT DstVT = Op.getValueType(); 7762 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 7763 return LowerUINT_TO_FP_i64(Op, DAG); 7764 else if (SrcVT == MVT::i32 && X86ScalarSSEf64) 7765 return LowerUINT_TO_FP_i32(Op, DAG); 7766 7767 // Make a 64-bit buffer, and use it to build an FILD. 7768 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 7769 if (SrcVT == MVT::i32) { 7770 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 7771 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 7772 getPointerTy(), StackSlot, WordOff); 7773 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7774 StackSlot, MachinePointerInfo(), 7775 false, false, 0); 7776 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 7777 OffsetSlot, MachinePointerInfo(), 7778 false, false, 0); 7779 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 7780 return Fild; 7781 } 7782 7783 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 7784 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7785 StackSlot, MachinePointerInfo(), 7786 false, false, 0); 7787 // For i64 source, we need to add the appropriate power of 2 if the input 7788 // was negative. This is the same as the optimization in 7789 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, 7790 // we must be careful to do the computation in x87 extended precision, not 7791 // in SSE. (The generic code can't know it's OK to do this, or how to.) 
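  // Concretely: the FILD below reads the stored 64-bit value as a *signed*
  // integer, so if the original unsigned value had its top bit set the x87
  // result is exactly 2^64 too small.  The constant-pool pair built below is
  // roughly { 0x1.0p64f, 0.0f } (0x5F800000 is 2^64 as a float); we pick the
  // 2^64 half when the sign bit was set and add it back in f80, so the only
  // rounding happens at the final FP_ROUND.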
7792 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 7793 MachineMemOperand *MMO = 7794 DAG.getMachineFunction() 7795 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7796 MachineMemOperand::MOLoad, 8, 8); 7797 7798 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 7799 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 7800 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3, 7801 MVT::i64, MMO); 7802 7803 APInt FF(32, 0x5F800000ULL); 7804 7805 // Check whether the sign bit is set. 7806 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 7807 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 7808 ISD::SETLT); 7809 7810 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 7811 SDValue FudgePtr = DAG.getConstantPool( 7812 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 7813 getPointerTy()); 7814 7815 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 7816 SDValue Zero = DAG.getIntPtrConstant(0); 7817 SDValue Four = DAG.getIntPtrConstant(4); 7818 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 7819 Zero, Four); 7820 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 7821 7822 // Load the value out, extending it from f32 to f80. 7823 // FIXME: Avoid the extend by constructing the right constant pool? 7824 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), 7825 FudgePtr, MachinePointerInfo::getConstantPool(), 7826 MVT::f32, false, false, 4); 7827 // Extend everything to 80 bits to force it to be done on x87. 7828 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 7829 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 7830} 7831 7832std::pair<SDValue,SDValue> X86TargetLowering:: 7833FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 7834 DebugLoc DL = Op.getDebugLoc(); 7835 7836 EVT DstTy = Op.getValueType(); 7837 7838 if (!IsSigned) { 7839 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 7840 DstTy = MVT::i64; 7841 } 7842 7843 assert(DstTy.getSimpleVT() <= MVT::i64 && 7844 DstTy.getSimpleVT() >= MVT::i16 && 7845 "Unknown FP_TO_SINT to lower!"); 7846 7847 // These are really Legal. 7848 if (DstTy == MVT::i32 && 7849 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 7850 return std::make_pair(SDValue(), SDValue()); 7851 if (Subtarget->is64Bit() && 7852 DstTy == MVT::i64 && 7853 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 7854 return std::make_pair(SDValue(), SDValue()); 7855 7856 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 7857 // stack slot. 
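  // For an SSE source the emitted sequence is roughly:
  //
  //   movsd   %xmm0, (slot1)      // spill the SSE value
  //   fldl    (slot1)             // reload it onto the x87 stack (X86ISD::FLD)
  //   fistpll (slot2)             // FP_TO_INT64_IN_MEM; the pseudo arranges for
  //                               // round-toward-zero around the store
  //
  // where slot1 and slot2 are the two stack objects created below, and the
  // caller then loads the integer result back out of slot2.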
7858 MachineFunction &MF = DAG.getMachineFunction(); 7859 unsigned MemSize = DstTy.getSizeInBits()/8; 7860 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 7861 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7862 7863 7864 7865 unsigned Opc; 7866 switch (DstTy.getSimpleVT().SimpleTy) { 7867 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 7868 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 7869 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 7870 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 7871 } 7872 7873 SDValue Chain = DAG.getEntryNode(); 7874 SDValue Value = Op.getOperand(0); 7875 EVT TheVT = Op.getOperand(0).getValueType(); 7876 if (isScalarFPTypeInSSEReg(TheVT)) { 7877 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 7878 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 7879 MachinePointerInfo::getFixedStack(SSFI), 7880 false, false, 0); 7881 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 7882 SDValue Ops[] = { 7883 Chain, StackSlot, DAG.getValueType(TheVT) 7884 }; 7885 7886 MachineMemOperand *MMO = 7887 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7888 MachineMemOperand::MOLoad, MemSize, MemSize); 7889 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, 7890 DstTy, MMO); 7891 Chain = Value.getValue(1); 7892 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 7893 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7894 } 7895 7896 MachineMemOperand *MMO = 7897 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7898 MachineMemOperand::MOStore, MemSize, MemSize); 7899 7900 // Build the FP_TO_INT*_IN_MEM 7901 SDValue Ops[] = { Chain, Value, StackSlot }; 7902 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 7903 Ops, 3, DstTy, MMO); 7904 7905 return std::make_pair(FIST, StackSlot); 7906} 7907 7908SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 7909 SelectionDAG &DAG) const { 7910 if (Op.getValueType().isVector()) 7911 return SDValue(); 7912 7913 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 7914 SDValue FIST = Vals.first, StackSlot = Vals.second; 7915 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 7916 if (FIST.getNode() == 0) return Op; 7917 7918 // Load the result. 7919 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 7920 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 7921} 7922 7923SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 7924 SelectionDAG &DAG) const { 7925 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 7926 SDValue FIST = Vals.first, StackSlot = Vals.second; 7927 assert(FIST.getNode() && "Unexpected failure"); 7928 7929 // Load the result. 
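  // FP_TO_INTHelper widened DstTy to i64 for the unsigned case, so the FIST
  // wrote 8 bytes; loading the i32 result type here picks up the low 32 bits,
  // which on little-endian x86 is the value we want.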
7930 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 7931 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 7932} 7933 7934SDValue X86TargetLowering::LowerFABS(SDValue Op, 7935 SelectionDAG &DAG) const { 7936 LLVMContext *Context = DAG.getContext(); 7937 DebugLoc dl = Op.getDebugLoc(); 7938 EVT VT = Op.getValueType(); 7939 EVT EltVT = VT; 7940 if (VT.isVector()) 7941 EltVT = VT.getVectorElementType(); 7942 std::vector<Constant*> CV; 7943 if (EltVT == MVT::f64) { 7944 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 7945 CV.push_back(C); 7946 CV.push_back(C); 7947 } else { 7948 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 7949 CV.push_back(C); 7950 CV.push_back(C); 7951 CV.push_back(C); 7952 CV.push_back(C); 7953 } 7954 Constant *C = ConstantVector::get(CV); 7955 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7956 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7957 MachinePointerInfo::getConstantPool(), 7958 false, false, 16); 7959 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 7960} 7961 7962SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 7963 LLVMContext *Context = DAG.getContext(); 7964 DebugLoc dl = Op.getDebugLoc(); 7965 EVT VT = Op.getValueType(); 7966 EVT EltVT = VT; 7967 if (VT.isVector()) 7968 EltVT = VT.getVectorElementType(); 7969 std::vector<Constant*> CV; 7970 if (EltVT == MVT::f64) { 7971 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 7972 CV.push_back(C); 7973 CV.push_back(C); 7974 } else { 7975 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 7976 CV.push_back(C); 7977 CV.push_back(C); 7978 CV.push_back(C); 7979 CV.push_back(C); 7980 } 7981 Constant *C = ConstantVector::get(CV); 7982 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7983 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7984 MachinePointerInfo::getConstantPool(), 7985 false, false, 16); 7986 if (VT.isVector()) { 7987 return DAG.getNode(ISD::BITCAST, dl, VT, 7988 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 7989 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7990 Op.getOperand(0)), 7991 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); 7992 } else { 7993 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 7994 } 7995} 7996 7997SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 7998 LLVMContext *Context = DAG.getContext(); 7999 SDValue Op0 = Op.getOperand(0); 8000 SDValue Op1 = Op.getOperand(1); 8001 DebugLoc dl = Op.getDebugLoc(); 8002 EVT VT = Op.getValueType(); 8003 EVT SrcVT = Op1.getValueType(); 8004 8005 // If second operand is smaller, extend it first. 8006 if (SrcVT.bitsLT(VT)) { 8007 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 8008 SrcVT = VT; 8009 } 8010 // And if it is bigger, shrink it first. 8011 if (SrcVT.bitsGT(VT)) { 8012 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 8013 SrcVT = VT; 8014 } 8015 8016 // At this point the operands and the result should have the same 8017 // type, and that won't be f80 since that is not custom lowered. 8018 8019 // First get the sign bit of second operand. 
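  // In outline, the whole lowering computes, with constant-pool masks and the
  // vector FAND/FOR nodes:
  //
  //   result = (Op1 & sign-bit-mask) | (Op0 & ~sign-bit-mask)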
8020 std::vector<Constant*> CV; 8021 if (SrcVT == MVT::f64) { 8022 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 8023 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 8024 } else { 8025 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 8026 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8027 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8028 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8029 } 8030 Constant *C = ConstantVector::get(CV); 8031 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8032 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 8033 MachinePointerInfo::getConstantPool(), 8034 false, false, 16); 8035 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 8036 8037 // Shift sign bit right or left if the two operands have different types. 8038 if (SrcVT.bitsGT(VT)) { 8039 // Op0 is MVT::f32, Op1 is MVT::f64. 8040 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 8041 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 8042 DAG.getConstant(32, MVT::i32)); 8043 SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); 8044 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 8045 DAG.getIntPtrConstant(0)); 8046 } 8047 8048 // Clear first operand sign bit. 8049 CV.clear(); 8050 if (VT == MVT::f64) { 8051 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 8052 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 8053 } else { 8054 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 8055 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8056 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8057 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8058 } 8059 C = ConstantVector::get(CV); 8060 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8061 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8062 MachinePointerInfo::getConstantPool(), 8063 false, false, 16); 8064 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 8065 8066 // Or the value with the sign bit. 8067 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 8068} 8069 8070SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { 8071 SDValue N0 = Op.getOperand(0); 8072 DebugLoc dl = Op.getDebugLoc(); 8073 EVT VT = Op.getValueType(); 8074 8075 // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). 8076 SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, 8077 DAG.getConstant(1, VT)); 8078 return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); 8079} 8080 8081/// Emit nodes that will be selected as "test Op0,Op0", or something 8082/// equivalent. 8083SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 8084 SelectionDAG &DAG) const { 8085 DebugLoc dl = Op.getDebugLoc(); 8086 8087 // CF and OF aren't always set the way we want. Determine which 8088 // of these we need. 
8089 bool NeedCF = false; 8090 bool NeedOF = false; 8091 switch (X86CC) { 8092 default: break; 8093 case X86::COND_A: case X86::COND_AE: 8094 case X86::COND_B: case X86::COND_BE: 8095 NeedCF = true; 8096 break; 8097 case X86::COND_G: case X86::COND_GE: 8098 case X86::COND_L: case X86::COND_LE: 8099 case X86::COND_O: case X86::COND_NO: 8100 NeedOF = true; 8101 break; 8102 } 8103 8104 // See if we can use the EFLAGS value from the operand instead of 8105 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 8106 // we prove that the arithmetic won't overflow, we can't use OF or CF. 8107 if (Op.getResNo() != 0 || NeedOF || NeedCF) 8108 // Emit a CMP with 0, which is the TEST pattern. 8109 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 8110 DAG.getConstant(0, Op.getValueType())); 8111 8112 unsigned Opcode = 0; 8113 unsigned NumOperands = 0; 8114 switch (Op.getNode()->getOpcode()) { 8115 case ISD::ADD: 8116 // Due to an isel shortcoming, be conservative if this add is likely to be 8117 // selected as part of a load-modify-store instruction. When the root node 8118 // in a match is a store, isel doesn't know how to remap non-chain non-flag 8119 // uses of other nodes in the match, such as the ADD in this case. This 8120 // leads to the ADD being left around and reselected, with the result being 8121 // two adds in the output. Alas, even if none our users are stores, that 8122 // doesn't prove we're O.K. Ergo, if we have any parents that aren't 8123 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require 8124 // climbing the DAG back to the root, and it doesn't seem to be worth the 8125 // effort. 8126 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 8127 UE = Op.getNode()->use_end(); UI != UE; ++UI) 8128 if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) 8129 goto default_case; 8130 8131 if (ConstantSDNode *C = 8132 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 8133 // An add of one will be selected as an INC. 8134 if (C->getAPIntValue() == 1) { 8135 Opcode = X86ISD::INC; 8136 NumOperands = 1; 8137 break; 8138 } 8139 8140 // An add of negative one (subtract of one) will be selected as a DEC. 8141 if (C->getAPIntValue().isAllOnesValue()) { 8142 Opcode = X86ISD::DEC; 8143 NumOperands = 1; 8144 break; 8145 } 8146 } 8147 8148 // Otherwise use a regular EFLAGS-setting add. 8149 Opcode = X86ISD::ADD; 8150 NumOperands = 2; 8151 break; 8152 case ISD::AND: { 8153 // If the primary and result isn't used, don't bother using X86ISD::AND, 8154 // because a TEST instruction will be better. 8155 bool NonFlagUse = false; 8156 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 8157 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 8158 SDNode *User = *UI; 8159 unsigned UOpNo = UI.getOperandNo(); 8160 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { 8161 // Look pass truncate. 8162 UOpNo = User->use_begin().getOperandNo(); 8163 User = *User->use_begin(); 8164 } 8165 8166 if (User->getOpcode() != ISD::BRCOND && 8167 User->getOpcode() != ISD::SETCC && 8168 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { 8169 NonFlagUse = true; 8170 break; 8171 } 8172 } 8173 8174 if (!NonFlagUse) 8175 break; 8176 } 8177 // FALL THROUGH 8178 case ISD::SUB: 8179 case ISD::OR: 8180 case ISD::XOR: 8181 // Due to the ISEL shortcoming noted above, be conservative if this op is 8182 // likely to be selected as part of a load-modify-store instruction. 
8183 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 8184 UE = Op.getNode()->use_end(); UI != UE; ++UI) 8185 if (UI->getOpcode() == ISD::STORE) 8186 goto default_case; 8187 8188 // Otherwise use a regular EFLAGS-setting instruction. 8189 switch (Op.getNode()->getOpcode()) { 8190 default: llvm_unreachable("unexpected operator!"); 8191 case ISD::SUB: Opcode = X86ISD::SUB; break; 8192 case ISD::OR: Opcode = X86ISD::OR; break; 8193 case ISD::XOR: Opcode = X86ISD::XOR; break; 8194 case ISD::AND: Opcode = X86ISD::AND; break; 8195 } 8196 8197 NumOperands = 2; 8198 break; 8199 case X86ISD::ADD: 8200 case X86ISD::SUB: 8201 case X86ISD::INC: 8202 case X86ISD::DEC: 8203 case X86ISD::OR: 8204 case X86ISD::XOR: 8205 case X86ISD::AND: 8206 return SDValue(Op.getNode(), 1); 8207 default: 8208 default_case: 8209 break; 8210 } 8211 8212 if (Opcode == 0) 8213 // Emit a CMP with 0, which is the TEST pattern. 8214 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 8215 DAG.getConstant(0, Op.getValueType())); 8216 8217 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 8218 SmallVector<SDValue, 4> Ops; 8219 for (unsigned i = 0; i != NumOperands; ++i) 8220 Ops.push_back(Op.getOperand(i)); 8221 8222 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 8223 DAG.ReplaceAllUsesWith(Op, New); 8224 return SDValue(New.getNode(), 1); 8225} 8226 8227/// Emit nodes that will be selected as "cmp Op0,Op1", or something 8228/// equivalent. 8229SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 8230 SelectionDAG &DAG) const { 8231 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 8232 if (C->getAPIntValue() == 0) 8233 return EmitTest(Op0, X86CC, DAG); 8234 8235 DebugLoc dl = Op0.getDebugLoc(); 8236 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 8237} 8238 8239/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 8240/// if it's possible. 8241SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 8242 DebugLoc dl, SelectionDAG &DAG) const { 8243 SDValue Op0 = And.getOperand(0); 8244 SDValue Op1 = And.getOperand(1); 8245 if (Op0.getOpcode() == ISD::TRUNCATE) 8246 Op0 = Op0.getOperand(0); 8247 if (Op1.getOpcode() == ISD::TRUNCATE) 8248 Op1 = Op1.getOperand(0); 8249 8250 SDValue LHS, RHS; 8251 if (Op1.getOpcode() == ISD::SHL) 8252 std::swap(Op0, Op1); 8253 if (Op0.getOpcode() == ISD::SHL) { 8254 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 8255 if (And00C->getZExtValue() == 1) { 8256 // If we looked past a truncate, check that it's only truncating away 8257 // known zeros. 8258 unsigned BitWidth = Op0.getValueSizeInBits(); 8259 unsigned AndBitWidth = And.getValueSizeInBits(); 8260 if (BitWidth > AndBitWidth) { 8261 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 8262 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 8263 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 8264 return SDValue(); 8265 } 8266 LHS = Op1; 8267 RHS = Op0.getOperand(1); 8268 } 8269 } else if (Op1.getOpcode() == ISD::Constant) { 8270 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 8271 SDValue AndLHS = Op0; 8272 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 8273 LHS = AndLHS.getOperand(0); 8274 RHS = AndLHS.getOperand(1); 8275 } 8276 } 8277 8278 if (LHS.getNode()) { 8279 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 8280 // instruction. Since the shift amount is in-range-or-undefined, we know 8281 // that doing a bittest on the i32 value is ok. 
We extend to i32 because 8282 // the encoding for the i16 version is larger than the i32 version. 8283 // Also promote i16 to i32 for performance / code size reason. 8284 if (LHS.getValueType() == MVT::i8 || 8285 LHS.getValueType() == MVT::i16) 8286 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 8287 8288 // If the operand types disagree, extend the shift amount to match. Since 8289 // BT ignores high bits (like shifts) we can use anyextend. 8290 if (LHS.getValueType() != RHS.getValueType()) 8291 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 8292 8293 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 8294 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 8295 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8296 DAG.getConstant(Cond, MVT::i8), BT); 8297 } 8298 8299 return SDValue(); 8300} 8301 8302SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 8303 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 8304 SDValue Op0 = Op.getOperand(0); 8305 SDValue Op1 = Op.getOperand(1); 8306 DebugLoc dl = Op.getDebugLoc(); 8307 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 8308 8309 // Optimize to BT if possible. 8310 // Lower (X & (1 << N)) == 0 to BT(X, N). 8311 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 8312 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 8313 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 8314 Op1.getOpcode() == ISD::Constant && 8315 cast<ConstantSDNode>(Op1)->isNullValue() && 8316 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 8317 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 8318 if (NewSetCC.getNode()) 8319 return NewSetCC; 8320 } 8321 8322 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 8323 // these. 8324 if (Op1.getOpcode() == ISD::Constant && 8325 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 8326 cast<ConstantSDNode>(Op1)->isNullValue()) && 8327 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 8328 8329 // If the input is a setcc, then reuse the input setcc or use a new one with 8330 // the inverted condition. 8331 if (Op0.getOpcode() == X86ISD::SETCC) { 8332 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 8333 bool Invert = (CC == ISD::SETNE) ^ 8334 cast<ConstantSDNode>(Op1)->isNullValue(); 8335 if (!Invert) return Op0; 8336 8337 CCode = X86::GetOppositeBranchCondition(CCode); 8338 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8339 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 8340 } 8341 } 8342 8343 bool isFP = Op1.getValueType().isFloatingPoint(); 8344 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 8345 if (X86CC == X86::COND_INVALID) 8346 return SDValue(); 8347 8348 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); 8349 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8350 DAG.getConstant(X86CC, MVT::i8), EFLAGS); 8351} 8352 8353// Lower256IntVETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 8354// ones, and then concatenate the result back. 
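// For example, a v8i32 compare becomes two v4i32 compares on the extracted
// 128-bit halves of each operand, glued back together with CONCAT_VECTORS.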
8355static SDValue Lower256IntVETCC(SDValue Op, SelectionDAG &DAG) { 8356 EVT VT = Op.getValueType(); 8357 8358 assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::VSETCC && 8359 "Unsupported value type for operation"); 8360 8361 int NumElems = VT.getVectorNumElements(); 8362 DebugLoc dl = Op.getDebugLoc(); 8363 SDValue CC = Op.getOperand(2); 8364 SDValue Idx0 = DAG.getConstant(0, MVT::i32); 8365 SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); 8366 8367 // Extract the LHS vectors 8368 SDValue LHS = Op.getOperand(0); 8369 SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); 8370 SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); 8371 8372 // Extract the RHS vectors 8373 SDValue RHS = Op.getOperand(1); 8374 SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl); 8375 SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl); 8376 8377 // Issue the operation on the smaller types and concatenate the result back 8378 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 8379 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 8380 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 8381 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), 8382 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); 8383} 8384 8385 8386SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 8387 SDValue Cond; 8388 SDValue Op0 = Op.getOperand(0); 8389 SDValue Op1 = Op.getOperand(1); 8390 SDValue CC = Op.getOperand(2); 8391 EVT VT = Op.getValueType(); 8392 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 8393 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 8394 DebugLoc dl = Op.getDebugLoc(); 8395 8396 if (isFP) { 8397 unsigned SSECC = 8; 8398 EVT EltVT = Op0.getValueType().getVectorElementType(); 8399 assert(EltVT == MVT::f32 || EltVT == MVT::f64); 8400 8401 unsigned Opc = EltVT == MVT::f32 ? X86ISD::CMPPS : X86ISD::CMPPD; 8402 bool Swap = false; 8403 8404 switch (SetCCOpcode) { 8405 default: break; 8406 case ISD::SETOEQ: 8407 case ISD::SETEQ: SSECC = 0; break; 8408 case ISD::SETOGT: 8409 case ISD::SETGT: Swap = true; // Fallthrough 8410 case ISD::SETLT: 8411 case ISD::SETOLT: SSECC = 1; break; 8412 case ISD::SETOGE: 8413 case ISD::SETGE: Swap = true; // Fallthrough 8414 case ISD::SETLE: 8415 case ISD::SETOLE: SSECC = 2; break; 8416 case ISD::SETUO: SSECC = 3; break; 8417 case ISD::SETUNE: 8418 case ISD::SETNE: SSECC = 4; break; 8419 case ISD::SETULE: Swap = true; 8420 case ISD::SETUGE: SSECC = 5; break; 8421 case ISD::SETULT: Swap = true; 8422 case ISD::SETUGT: SSECC = 6; break; 8423 case ISD::SETO: SSECC = 7; break; 8424 } 8425 if (Swap) 8426 std::swap(Op0, Op1); 8427 8428 // In the two special cases we can't handle, emit two comparisons. 8429 if (SSECC == 8) { 8430 if (SetCCOpcode == ISD::SETUEQ) { 8431 SDValue UNORD, EQ; 8432 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 8433 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 8434 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 8435 } 8436 else if (SetCCOpcode == ISD::SETONE) { 8437 SDValue ORD, NEQ; 8438 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 8439 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 8440 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 8441 } 8442 llvm_unreachable("Illegal FP comparison"); 8443 } 8444 // Handle all other FP comparisons here. 
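    // For reference, SSECC is the CMPPS/CMPPD immediate predicate:
    //   0 = EQ, 1 = LT, 2 = LE, 3 = UNORD, 4 = NEQ, 5 = NLT, 6 = NLE, 7 = ORD
    // (3 and 7 are also used directly in the SETUEQ/SETONE special cases above).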
8445 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 8446 } 8447 8448 // Break 256-bit integer vector compare into smaller ones. 8449 if (!isFP && VT.getSizeInBits() == 256) 8450 return Lower256IntVETCC(Op, DAG); 8451 8452 // We are handling one of the integer comparisons here. Since SSE only has 8453 // GT and EQ comparisons for integer, swapping operands and multiple 8454 // operations may be required for some comparisons. 8455 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 8456 bool Swap = false, Invert = false, FlipSigns = false; 8457 8458 switch (VT.getSimpleVT().SimpleTy) { 8459 default: break; 8460 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 8461 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 8462 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 8463 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 8464 } 8465 8466 switch (SetCCOpcode) { 8467 default: break; 8468 case ISD::SETNE: Invert = true; 8469 case ISD::SETEQ: Opc = EQOpc; break; 8470 case ISD::SETLT: Swap = true; 8471 case ISD::SETGT: Opc = GTOpc; break; 8472 case ISD::SETGE: Swap = true; 8473 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 8474 case ISD::SETULT: Swap = true; 8475 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 8476 case ISD::SETUGE: Swap = true; 8477 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 8478 } 8479 if (Swap) 8480 std::swap(Op0, Op1); 8481 8482 // Since SSE has no unsigned integer comparisons, we need to flip the sign 8483 // bits of the inputs before performing those operations. 8484 if (FlipSigns) { 8485 EVT EltVT = VT.getVectorElementType(); 8486 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 8487 EltVT); 8488 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 8489 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 8490 SignBits.size()); 8491 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 8492 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 8493 } 8494 8495 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 8496 8497 // If the logical-not of the result is required, perform that now. 8498 if (Invert) 8499 Result = DAG.getNOT(dl, Result, VT); 8500 8501 return Result; 8502} 8503 8504// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
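// That is, a node whose EFLAGS result can feed SETCC/CMOV/BRCOND directly: an
// explicit CMP/COMI/UCOMI, or the flag result (value #1, or #2 for UMUL) of one
// of the EFLAGS-producing arithmetic nodes listed below.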
8505static bool isX86LogicalCmp(SDValue Op) { 8506 unsigned Opc = Op.getNode()->getOpcode(); 8507 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 8508 return true; 8509 if (Op.getResNo() == 1 && 8510 (Opc == X86ISD::ADD || 8511 Opc == X86ISD::SUB || 8512 Opc == X86ISD::ADC || 8513 Opc == X86ISD::SBB || 8514 Opc == X86ISD::SMUL || 8515 Opc == X86ISD::UMUL || 8516 Opc == X86ISD::INC || 8517 Opc == X86ISD::DEC || 8518 Opc == X86ISD::OR || 8519 Opc == X86ISD::XOR || 8520 Opc == X86ISD::AND)) 8521 return true; 8522 8523 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 8524 return true; 8525 8526 return false; 8527} 8528 8529static bool isZero(SDValue V) { 8530 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 8531 return C && C->isNullValue(); 8532} 8533 8534static bool isAllOnes(SDValue V) { 8535 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 8536 return C && C->isAllOnesValue(); 8537} 8538 8539SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 8540 bool addTest = true; 8541 SDValue Cond = Op.getOperand(0); 8542 SDValue Op1 = Op.getOperand(1); 8543 SDValue Op2 = Op.getOperand(2); 8544 DebugLoc DL = Op.getDebugLoc(); 8545 SDValue CC; 8546 8547 if (Cond.getOpcode() == ISD::SETCC) { 8548 SDValue NewCond = LowerSETCC(Cond, DAG); 8549 if (NewCond.getNode()) 8550 Cond = NewCond; 8551 } 8552 8553 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y 8554 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y 8555 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y 8556 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y 8557 if (Cond.getOpcode() == X86ISD::SETCC && 8558 Cond.getOperand(1).getOpcode() == X86ISD::CMP && 8559 isZero(Cond.getOperand(1).getOperand(1))) { 8560 SDValue Cmp = Cond.getOperand(1); 8561 8562 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); 8563 8564 if ((isAllOnes(Op1) || isAllOnes(Op2)) && 8565 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { 8566 SDValue Y = isAllOnes(Op2) ? Op1 : Op2; 8567 8568 SDValue CmpOp0 = Cmp.getOperand(0); 8569 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, 8570 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 8571 8572 SDValue Res = // Res = 0 or -1. 8573 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 8574 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 8575 8576 if (isAllOnes(Op1) != (CondCode == X86::COND_E)) 8577 Res = DAG.getNOT(DL, Res, Res.getValueType()); 8578 8579 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 8580 if (N2C == 0 || !N2C->isNullValue()) 8581 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); 8582 return Res; 8583 } 8584 } 8585 8586 // Look past (and (setcc_carry (cmp ...)), 1). 8587 if (Cond.getOpcode() == ISD::AND && 8588 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 8589 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 8590 if (C && C->getAPIntValue() == 1) 8591 Cond = Cond.getOperand(0); 8592 } 8593 8594 // If condition flag is set by a X86ISD::CMP, then use it as the condition 8595 // setting operand in place of the X86ISD::SETCC. 8596 if (Cond.getOpcode() == X86ISD::SETCC || 8597 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 8598 CC = Cond.getOperand(0); 8599 8600 SDValue Cmp = Cond.getOperand(1); 8601 unsigned Opc = Cmp.getOpcode(); 8602 EVT VT = Op.getValueType(); 8603 8604 bool IllegalFPCMov = false; 8605 if (VT.isFloatingPoint() && !VT.isVector() && 8606 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
8607 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 8608 8609 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 8610 Opc == X86ISD::BT) { // FIXME 8611 Cond = Cmp; 8612 addTest = false; 8613 } 8614 } 8615 8616 if (addTest) { 8617 // Look pass the truncate. 8618 if (Cond.getOpcode() == ISD::TRUNCATE) 8619 Cond = Cond.getOperand(0); 8620 8621 // We know the result of AND is compared against zero. Try to match 8622 // it to BT. 8623 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 8624 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); 8625 if (NewSetCC.getNode()) { 8626 CC = NewSetCC.getOperand(0); 8627 Cond = NewSetCC.getOperand(1); 8628 addTest = false; 8629 } 8630 } 8631 } 8632 8633 if (addTest) { 8634 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 8635 Cond = EmitTest(Cond, X86::COND_NE, DAG); 8636 } 8637 8638 // a < b ? -1 : 0 -> RES = ~setcc_carry 8639 // a < b ? 0 : -1 -> RES = setcc_carry 8640 // a >= b ? -1 : 0 -> RES = setcc_carry 8641 // a >= b ? 0 : -1 -> RES = ~setcc_carry 8642 if (Cond.getOpcode() == X86ISD::CMP) { 8643 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); 8644 8645 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && 8646 (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { 8647 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 8648 DAG.getConstant(X86::COND_B, MVT::i8), Cond); 8649 if (isAllOnes(Op1) != (CondCode == X86::COND_B)) 8650 return DAG.getNOT(DL, Res, Res.getValueType()); 8651 return Res; 8652 } 8653 } 8654 8655 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 8656 // condition is true. 8657 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); 8658 SDValue Ops[] = { Op2, Op1, CC, Cond }; 8659 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); 8660} 8661 8662// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 8663// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 8664// from the AND / OR. 8665static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 8666 Opc = Op.getOpcode(); 8667 if (Opc != ISD::OR && Opc != ISD::AND) 8668 return false; 8669 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 8670 Op.getOperand(0).hasOneUse() && 8671 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 8672 Op.getOperand(1).hasOneUse()); 8673} 8674 8675// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 8676// 1 and that the SETCC node has a single use. 8677static bool isXor1OfSetCC(SDValue Op) { 8678 if (Op.getOpcode() != ISD::XOR) 8679 return false; 8680 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 8681 if (N1C && N1C->getAPIntValue() == 1) { 8682 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 8683 Op.getOperand(0).hasOneUse(); 8684 } 8685 return false; 8686} 8687 8688SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 8689 bool addTest = true; 8690 SDValue Chain = Op.getOperand(0); 8691 SDValue Cond = Op.getOperand(1); 8692 SDValue Dest = Op.getOperand(2); 8693 DebugLoc dl = Op.getDebugLoc(); 8694 SDValue CC; 8695 8696 if (Cond.getOpcode() == ISD::SETCC) { 8697 SDValue NewCond = LowerSETCC(Cond, DAG); 8698 if (NewCond.getNode()) 8699 Cond = NewCond; 8700 } 8701#if 0 8702 // FIXME: LowerXALUO doesn't handle these!! 
8703 else if (Cond.getOpcode() == X86ISD::ADD || 8704 Cond.getOpcode() == X86ISD::SUB || 8705 Cond.getOpcode() == X86ISD::SMUL || 8706 Cond.getOpcode() == X86ISD::UMUL) 8707 Cond = LowerXALUO(Cond, DAG); 8708#endif 8709 8710 // Look pass (and (setcc_carry (cmp ...)), 1). 8711 if (Cond.getOpcode() == ISD::AND && 8712 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 8713 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 8714 if (C && C->getAPIntValue() == 1) 8715 Cond = Cond.getOperand(0); 8716 } 8717 8718 // If condition flag is set by a X86ISD::CMP, then use it as the condition 8719 // setting operand in place of the X86ISD::SETCC. 8720 if (Cond.getOpcode() == X86ISD::SETCC || 8721 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 8722 CC = Cond.getOperand(0); 8723 8724 SDValue Cmp = Cond.getOperand(1); 8725 unsigned Opc = Cmp.getOpcode(); 8726 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 8727 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 8728 Cond = Cmp; 8729 addTest = false; 8730 } else { 8731 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 8732 default: break; 8733 case X86::COND_O: 8734 case X86::COND_B: 8735 // These can only come from an arithmetic instruction with overflow, 8736 // e.g. SADDO, UADDO. 8737 Cond = Cond.getNode()->getOperand(1); 8738 addTest = false; 8739 break; 8740 } 8741 } 8742 } else { 8743 unsigned CondOpc; 8744 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 8745 SDValue Cmp = Cond.getOperand(0).getOperand(1); 8746 if (CondOpc == ISD::OR) { 8747 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 8748 // two branches instead of an explicit OR instruction with a 8749 // separate test. 8750 if (Cmp == Cond.getOperand(1).getOperand(1) && 8751 isX86LogicalCmp(Cmp)) { 8752 CC = Cond.getOperand(0).getOperand(0); 8753 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8754 Chain, Dest, CC, Cmp); 8755 CC = Cond.getOperand(1).getOperand(0); 8756 Cond = Cmp; 8757 addTest = false; 8758 } 8759 } else { // ISD::AND 8760 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 8761 // two branches instead of an explicit AND instruction with a 8762 // separate test. However, we only do this if this block doesn't 8763 // have a fall-through edge, because this requires an explicit 8764 // jmp when the condition is false. 8765 if (Cmp == Cond.getOperand(1).getOperand(1) && 8766 isX86LogicalCmp(Cmp) && 8767 Op.getNode()->hasOneUse()) { 8768 X86::CondCode CCode = 8769 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 8770 CCode = X86::GetOppositeBranchCondition(CCode); 8771 CC = DAG.getConstant(CCode, MVT::i8); 8772 SDNode *User = *Op.getNode()->use_begin(); 8773 // Look for an unconditional branch following this conditional branch. 8774 // We need this because we need to reverse the successors in order 8775 // to implement FCMP_OEQ. 
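        // The end result for an OEQ compare-and-branch is roughly:
        //
        //   ucomisd %xmm1, %xmm0
        //   jne     .Lfalse          // not equal
        //   jp      .Lfalse          // unordered
        //   jmp     .Ltrue           // the retargeted unconditional branch
        //
        // (the order of the two conditional branches depends on how the AND of
        // the two SETCCs was built).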
8776 if (User->getOpcode() == ISD::BR) { 8777 SDValue FalseBB = User->getOperand(1); 8778 SDNode *NewBR = 8779 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 8780 assert(NewBR == User); 8781 (void)NewBR; 8782 Dest = FalseBB; 8783 8784 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8785 Chain, Dest, CC, Cmp); 8786 X86::CondCode CCode = 8787 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 8788 CCode = X86::GetOppositeBranchCondition(CCode); 8789 CC = DAG.getConstant(CCode, MVT::i8); 8790 Cond = Cmp; 8791 addTest = false; 8792 } 8793 } 8794 } 8795 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 8796 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 8797 // It should be transformed during dag combiner except when the condition 8798 // is set by a arithmetics with overflow node. 8799 X86::CondCode CCode = 8800 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 8801 CCode = X86::GetOppositeBranchCondition(CCode); 8802 CC = DAG.getConstant(CCode, MVT::i8); 8803 Cond = Cond.getOperand(0).getOperand(1); 8804 addTest = false; 8805 } 8806 } 8807 8808 if (addTest) { 8809 // Look pass the truncate. 8810 if (Cond.getOpcode() == ISD::TRUNCATE) 8811 Cond = Cond.getOperand(0); 8812 8813 // We know the result of AND is compared against zero. Try to match 8814 // it to BT. 8815 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 8816 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 8817 if (NewSetCC.getNode()) { 8818 CC = NewSetCC.getOperand(0); 8819 Cond = NewSetCC.getOperand(1); 8820 addTest = false; 8821 } 8822 } 8823 } 8824 8825 if (addTest) { 8826 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 8827 Cond = EmitTest(Cond, X86::COND_NE, DAG); 8828 } 8829 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8830 Chain, Dest, CC, Cond); 8831} 8832 8833 8834// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 8835// Calls to _alloca is needed to probe the stack when allocating more than 4k 8836// bytes in one go. Touching the stack at 4K increments is necessary to ensure 8837// that the guard pages used by the OS virtual memory manager are allocated in 8838// correct sequence. 8839SDValue 8840X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 8841 SelectionDAG &DAG) const { 8842 assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) && 8843 "This should be used only on Windows targets"); 8844 assert(!Subtarget->isTargetEnvMacho()); 8845 DebugLoc dl = Op.getDebugLoc(); 8846 8847 // Get the inputs. 8848 SDValue Chain = Op.getOperand(0); 8849 SDValue Size = Op.getOperand(1); 8850 // FIXME: Ensure alignment here 8851 8852 SDValue Flag; 8853 8854 EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; 8855 unsigned Reg = (Subtarget->is64Bit() ? 
X86::RAX : X86::EAX); 8856 8857 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); 8858 Flag = Chain.getValue(1); 8859 8860 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 8861 8862 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 8863 Flag = Chain.getValue(1); 8864 8865 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 8866 8867 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 8868 return DAG.getMergeValues(Ops1, 2, dl); 8869} 8870 8871SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 8872 MachineFunction &MF = DAG.getMachineFunction(); 8873 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 8874 8875 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 8876 DebugLoc DL = Op.getDebugLoc(); 8877 8878 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 8879 // vastart just stores the address of the VarArgsFrameIndex slot into the 8880 // memory location argument. 8881 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 8882 getPointerTy()); 8883 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 8884 MachinePointerInfo(SV), false, false, 0); 8885 } 8886 8887 // __va_list_tag: 8888 // gp_offset (0 - 6 * 8) 8889 // fp_offset (48 - 48 + 8 * 16) 8890 // overflow_arg_area (point to parameters coming in memory). 8891 // reg_save_area 8892 SmallVector<SDValue, 8> MemOps; 8893 SDValue FIN = Op.getOperand(1); 8894 // Store gp_offset 8895 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 8896 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 8897 MVT::i32), 8898 FIN, MachinePointerInfo(SV), false, false, 0); 8899 MemOps.push_back(Store); 8900 8901 // Store fp_offset 8902 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8903 FIN, DAG.getIntPtrConstant(4)); 8904 Store = DAG.getStore(Op.getOperand(0), DL, 8905 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 8906 MVT::i32), 8907 FIN, MachinePointerInfo(SV, 4), false, false, 0); 8908 MemOps.push_back(Store); 8909 8910 // Store ptr to overflow_arg_area 8911 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8912 FIN, DAG.getIntPtrConstant(4)); 8913 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 8914 getPointerTy()); 8915 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 8916 MachinePointerInfo(SV, 8), 8917 false, false, 0); 8918 MemOps.push_back(Store); 8919 8920 // Store ptr to reg_save_area. 
8921 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8922 FIN, DAG.getIntPtrConstant(8)); 8923 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 8924 getPointerTy()); 8925 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 8926 MachinePointerInfo(SV, 16), false, false, 0); 8927 MemOps.push_back(Store); 8928 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 8929 &MemOps[0], MemOps.size()); 8930} 8931 8932SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 8933 assert(Subtarget->is64Bit() && 8934 "LowerVAARG only handles 64-bit va_arg!"); 8935 assert((Subtarget->isTargetLinux() || 8936 Subtarget->isTargetDarwin()) && 8937 "Unhandled target in LowerVAARG"); 8938 assert(Op.getNode()->getNumOperands() == 4); 8939 SDValue Chain = Op.getOperand(0); 8940 SDValue SrcPtr = Op.getOperand(1); 8941 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 8942 unsigned Align = Op.getConstantOperandVal(3); 8943 DebugLoc dl = Op.getDebugLoc(); 8944 8945 EVT ArgVT = Op.getNode()->getValueType(0); 8946 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 8947 uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); 8948 uint8_t ArgMode; 8949 8950 // Decide which area this value should be read from. 8951 // TODO: Implement the AMD64 ABI in its entirety. This simple 8952 // selection mechanism works only for the basic types. 8953 if (ArgVT == MVT::f80) { 8954 llvm_unreachable("va_arg for f80 not yet implemented"); 8955 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 8956 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 8957 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 8958 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 8959 } else { 8960 llvm_unreachable("Unhandled argument type in LowerVAARG"); 8961 } 8962 8963 if (ArgMode == 2) { 8964 // Sanity Check: Make sure using fp_offset makes sense. 8965 assert(!UseSoftFloat && 8966 !(DAG.getMachineFunction() 8967 .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && 8968 Subtarget->hasXMM()); 8969 } 8970 8971 // Insert VAARG_64 node into the DAG 8972 // VAARG_64 returns two values: Variable Argument Address, Chain 8973 SmallVector<SDValue, 11> InstOps; 8974 InstOps.push_back(Chain); 8975 InstOps.push_back(SrcPtr); 8976 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 8977 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 8978 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 8979 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 8980 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 8981 VTs, &InstOps[0], InstOps.size(), 8982 MVT::i64, 8983 MachinePointerInfo(SV), 8984 /*Align=*/0, 8985 /*Volatile=*/false, 8986 /*ReadMem=*/true, 8987 /*WriteMem=*/true); 8988 Chain = VAARG.getValue(1); 8989 8990 // Load the next argument and return it 8991 return DAG.getLoad(ArgVT, dl, 8992 Chain, 8993 VAARG, 8994 MachinePointerInfo(), 8995 false, false, 0); 8996} 8997 8998SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 8999 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
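  // That is 4 + 4 + 8 + 8 = 24 bytes with no padding, which is why the memcpy
  // below copies exactly 24 bytes.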
9000 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 9001 SDValue Chain = Op.getOperand(0); 9002 SDValue DstPtr = Op.getOperand(1); 9003 SDValue SrcPtr = Op.getOperand(2); 9004 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 9005 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 9006 DebugLoc DL = Op.getDebugLoc(); 9007 9008 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 9009 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 9010 false, 9011 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 9012} 9013 9014SDValue 9015X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 9016 DebugLoc dl = Op.getDebugLoc(); 9017 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 9018 switch (IntNo) { 9019 default: return SDValue(); // Don't custom lower most intrinsics. 9020 // Comparison intrinsics. 9021 case Intrinsic::x86_sse_comieq_ss: 9022 case Intrinsic::x86_sse_comilt_ss: 9023 case Intrinsic::x86_sse_comile_ss: 9024 case Intrinsic::x86_sse_comigt_ss: 9025 case Intrinsic::x86_sse_comige_ss: 9026 case Intrinsic::x86_sse_comineq_ss: 9027 case Intrinsic::x86_sse_ucomieq_ss: 9028 case Intrinsic::x86_sse_ucomilt_ss: 9029 case Intrinsic::x86_sse_ucomile_ss: 9030 case Intrinsic::x86_sse_ucomigt_ss: 9031 case Intrinsic::x86_sse_ucomige_ss: 9032 case Intrinsic::x86_sse_ucomineq_ss: 9033 case Intrinsic::x86_sse2_comieq_sd: 9034 case Intrinsic::x86_sse2_comilt_sd: 9035 case Intrinsic::x86_sse2_comile_sd: 9036 case Intrinsic::x86_sse2_comigt_sd: 9037 case Intrinsic::x86_sse2_comige_sd: 9038 case Intrinsic::x86_sse2_comineq_sd: 9039 case Intrinsic::x86_sse2_ucomieq_sd: 9040 case Intrinsic::x86_sse2_ucomilt_sd: 9041 case Intrinsic::x86_sse2_ucomile_sd: 9042 case Intrinsic::x86_sse2_ucomigt_sd: 9043 case Intrinsic::x86_sse2_ucomige_sd: 9044 case Intrinsic::x86_sse2_ucomineq_sd: { 9045 unsigned Opc = 0; 9046 ISD::CondCode CC = ISD::SETCC_INVALID; 9047 switch (IntNo) { 9048 default: break; 9049 case Intrinsic::x86_sse_comieq_ss: 9050 case Intrinsic::x86_sse2_comieq_sd: 9051 Opc = X86ISD::COMI; 9052 CC = ISD::SETEQ; 9053 break; 9054 case Intrinsic::x86_sse_comilt_ss: 9055 case Intrinsic::x86_sse2_comilt_sd: 9056 Opc = X86ISD::COMI; 9057 CC = ISD::SETLT; 9058 break; 9059 case Intrinsic::x86_sse_comile_ss: 9060 case Intrinsic::x86_sse2_comile_sd: 9061 Opc = X86ISD::COMI; 9062 CC = ISD::SETLE; 9063 break; 9064 case Intrinsic::x86_sse_comigt_ss: 9065 case Intrinsic::x86_sse2_comigt_sd: 9066 Opc = X86ISD::COMI; 9067 CC = ISD::SETGT; 9068 break; 9069 case Intrinsic::x86_sse_comige_ss: 9070 case Intrinsic::x86_sse2_comige_sd: 9071 Opc = X86ISD::COMI; 9072 CC = ISD::SETGE; 9073 break; 9074 case Intrinsic::x86_sse_comineq_ss: 9075 case Intrinsic::x86_sse2_comineq_sd: 9076 Opc = X86ISD::COMI; 9077 CC = ISD::SETNE; 9078 break; 9079 case Intrinsic::x86_sse_ucomieq_ss: 9080 case Intrinsic::x86_sse2_ucomieq_sd: 9081 Opc = X86ISD::UCOMI; 9082 CC = ISD::SETEQ; 9083 break; 9084 case Intrinsic::x86_sse_ucomilt_ss: 9085 case Intrinsic::x86_sse2_ucomilt_sd: 9086 Opc = X86ISD::UCOMI; 9087 CC = ISD::SETLT; 9088 break; 9089 case Intrinsic::x86_sse_ucomile_ss: 9090 case Intrinsic::x86_sse2_ucomile_sd: 9091 Opc = X86ISD::UCOMI; 9092 CC = ISD::SETLE; 9093 break; 9094 case Intrinsic::x86_sse_ucomigt_ss: 9095 case Intrinsic::x86_sse2_ucomigt_sd: 9096 Opc = X86ISD::UCOMI; 9097 CC = ISD::SETGT; 9098 break; 9099 case Intrinsic::x86_sse_ucomige_ss: 9100 case Intrinsic::x86_sse2_ucomige_sd: 9101 Opc = X86ISD::UCOMI; 9102 
CC = ISD::SETGE; 9103 break; 9104 case Intrinsic::x86_sse_ucomineq_ss: 9105 case Intrinsic::x86_sse2_ucomineq_sd: 9106 Opc = X86ISD::UCOMI; 9107 CC = ISD::SETNE; 9108 break; 9109 } 9110 9111 SDValue LHS = Op.getOperand(1); 9112 SDValue RHS = Op.getOperand(2); 9113 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 9114 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 9115 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 9116 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 9117 DAG.getConstant(X86CC, MVT::i8), Cond); 9118 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 9119 } 9120 // ptest and testp intrinsics. The intrinsic these come from are designed to 9121 // return an integer value, not just an instruction so lower it to the ptest 9122 // or testp pattern and a setcc for the result. 9123 case Intrinsic::x86_sse41_ptestz: 9124 case Intrinsic::x86_sse41_ptestc: 9125 case Intrinsic::x86_sse41_ptestnzc: 9126 case Intrinsic::x86_avx_ptestz_256: 9127 case Intrinsic::x86_avx_ptestc_256: 9128 case Intrinsic::x86_avx_ptestnzc_256: 9129 case Intrinsic::x86_avx_vtestz_ps: 9130 case Intrinsic::x86_avx_vtestc_ps: 9131 case Intrinsic::x86_avx_vtestnzc_ps: 9132 case Intrinsic::x86_avx_vtestz_pd: 9133 case Intrinsic::x86_avx_vtestc_pd: 9134 case Intrinsic::x86_avx_vtestnzc_pd: 9135 case Intrinsic::x86_avx_vtestz_ps_256: 9136 case Intrinsic::x86_avx_vtestc_ps_256: 9137 case Intrinsic::x86_avx_vtestnzc_ps_256: 9138 case Intrinsic::x86_avx_vtestz_pd_256: 9139 case Intrinsic::x86_avx_vtestc_pd_256: 9140 case Intrinsic::x86_avx_vtestnzc_pd_256: { 9141 bool IsTestPacked = false; 9142 unsigned X86CC = 0; 9143 switch (IntNo) { 9144 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 9145 case Intrinsic::x86_avx_vtestz_ps: 9146 case Intrinsic::x86_avx_vtestz_pd: 9147 case Intrinsic::x86_avx_vtestz_ps_256: 9148 case Intrinsic::x86_avx_vtestz_pd_256: 9149 IsTestPacked = true; // Fallthrough 9150 case Intrinsic::x86_sse41_ptestz: 9151 case Intrinsic::x86_avx_ptestz_256: 9152 // ZF = 1 9153 X86CC = X86::COND_E; 9154 break; 9155 case Intrinsic::x86_avx_vtestc_ps: 9156 case Intrinsic::x86_avx_vtestc_pd: 9157 case Intrinsic::x86_avx_vtestc_ps_256: 9158 case Intrinsic::x86_avx_vtestc_pd_256: 9159 IsTestPacked = true; // Fallthrough 9160 case Intrinsic::x86_sse41_ptestc: 9161 case Intrinsic::x86_avx_ptestc_256: 9162 // CF = 1 9163 X86CC = X86::COND_B; 9164 break; 9165 case Intrinsic::x86_avx_vtestnzc_ps: 9166 case Intrinsic::x86_avx_vtestnzc_pd: 9167 case Intrinsic::x86_avx_vtestnzc_ps_256: 9168 case Intrinsic::x86_avx_vtestnzc_pd_256: 9169 IsTestPacked = true; // Fallthrough 9170 case Intrinsic::x86_sse41_ptestnzc: 9171 case Intrinsic::x86_avx_ptestnzc_256: 9172 // ZF and CF = 0 9173 X86CC = X86::COND_A; 9174 break; 9175 } 9176 9177 SDValue LHS = Op.getOperand(1); 9178 SDValue RHS = Op.getOperand(2); 9179 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 9180 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 9181 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 9182 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 9183 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 9184 } 9185 9186 // Fix vector shift instructions where the last operand is a non-immediate 9187 // i32 value. 
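  // For example, a pslli_d whose count is not a compile-time constant cannot use
  // the immediate encoding; it is rewritten below to the corresponding psll_d
  // intrinsic, with the count passed in the low 64 bits of a vector register.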
9188 case Intrinsic::x86_sse2_pslli_w: 9189 case Intrinsic::x86_sse2_pslli_d: 9190 case Intrinsic::x86_sse2_pslli_q: 9191 case Intrinsic::x86_sse2_psrli_w: 9192 case Intrinsic::x86_sse2_psrli_d: 9193 case Intrinsic::x86_sse2_psrli_q: 9194 case Intrinsic::x86_sse2_psrai_w: 9195 case Intrinsic::x86_sse2_psrai_d: 9196 case Intrinsic::x86_mmx_pslli_w: 9197 case Intrinsic::x86_mmx_pslli_d: 9198 case Intrinsic::x86_mmx_pslli_q: 9199 case Intrinsic::x86_mmx_psrli_w: 9200 case Intrinsic::x86_mmx_psrli_d: 9201 case Intrinsic::x86_mmx_psrli_q: 9202 case Intrinsic::x86_mmx_psrai_w: 9203 case Intrinsic::x86_mmx_psrai_d: { 9204 SDValue ShAmt = Op.getOperand(2); 9205 if (isa<ConstantSDNode>(ShAmt)) 9206 return SDValue(); 9207 9208 unsigned NewIntNo = 0; 9209 EVT ShAmtVT = MVT::v4i32; 9210 switch (IntNo) { 9211 case Intrinsic::x86_sse2_pslli_w: 9212 NewIntNo = Intrinsic::x86_sse2_psll_w; 9213 break; 9214 case Intrinsic::x86_sse2_pslli_d: 9215 NewIntNo = Intrinsic::x86_sse2_psll_d; 9216 break; 9217 case Intrinsic::x86_sse2_pslli_q: 9218 NewIntNo = Intrinsic::x86_sse2_psll_q; 9219 break; 9220 case Intrinsic::x86_sse2_psrli_w: 9221 NewIntNo = Intrinsic::x86_sse2_psrl_w; 9222 break; 9223 case Intrinsic::x86_sse2_psrli_d: 9224 NewIntNo = Intrinsic::x86_sse2_psrl_d; 9225 break; 9226 case Intrinsic::x86_sse2_psrli_q: 9227 NewIntNo = Intrinsic::x86_sse2_psrl_q; 9228 break; 9229 case Intrinsic::x86_sse2_psrai_w: 9230 NewIntNo = Intrinsic::x86_sse2_psra_w; 9231 break; 9232 case Intrinsic::x86_sse2_psrai_d: 9233 NewIntNo = Intrinsic::x86_sse2_psra_d; 9234 break; 9235 default: { 9236 ShAmtVT = MVT::v2i32; 9237 switch (IntNo) { 9238 case Intrinsic::x86_mmx_pslli_w: 9239 NewIntNo = Intrinsic::x86_mmx_psll_w; 9240 break; 9241 case Intrinsic::x86_mmx_pslli_d: 9242 NewIntNo = Intrinsic::x86_mmx_psll_d; 9243 break; 9244 case Intrinsic::x86_mmx_pslli_q: 9245 NewIntNo = Intrinsic::x86_mmx_psll_q; 9246 break; 9247 case Intrinsic::x86_mmx_psrli_w: 9248 NewIntNo = Intrinsic::x86_mmx_psrl_w; 9249 break; 9250 case Intrinsic::x86_mmx_psrli_d: 9251 NewIntNo = Intrinsic::x86_mmx_psrl_d; 9252 break; 9253 case Intrinsic::x86_mmx_psrli_q: 9254 NewIntNo = Intrinsic::x86_mmx_psrl_q; 9255 break; 9256 case Intrinsic::x86_mmx_psrai_w: 9257 NewIntNo = Intrinsic::x86_mmx_psra_w; 9258 break; 9259 case Intrinsic::x86_mmx_psrai_d: 9260 NewIntNo = Intrinsic::x86_mmx_psra_d; 9261 break; 9262 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 9263 } 9264 break; 9265 } 9266 } 9267 9268 // The vector shift intrinsics with scalars uses 32b shift amounts but 9269 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 9270 // to be zero. 9271 SDValue ShOps[4]; 9272 ShOps[0] = ShAmt; 9273 ShOps[1] = DAG.getConstant(0, MVT::i32); 9274 if (ShAmtVT == MVT::v4i32) { 9275 ShOps[2] = DAG.getUNDEF(MVT::i32); 9276 ShOps[3] = DAG.getUNDEF(MVT::i32); 9277 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 9278 } else { 9279 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 9280// FIXME this must be lowered to get rid of the invalid type. 
9281 } 9282 9283 EVT VT = Op.getValueType(); 9284 ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); 9285 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9286 DAG.getConstant(NewIntNo, MVT::i32), 9287 Op.getOperand(1), ShAmt); 9288 } 9289 } 9290} 9291 9292SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 9293 SelectionDAG &DAG) const { 9294 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 9295 MFI->setReturnAddressIsTaken(true); 9296 9297 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 9298 DebugLoc dl = Op.getDebugLoc(); 9299 9300 if (Depth > 0) { 9301 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 9302 SDValue Offset = 9303 DAG.getConstant(TD->getPointerSize(), 9304 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 9305 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 9306 DAG.getNode(ISD::ADD, dl, getPointerTy(), 9307 FrameAddr, Offset), 9308 MachinePointerInfo(), false, false, 0); 9309 } 9310 9311 // Just load the return address. 9312 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 9313 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 9314 RetAddrFI, MachinePointerInfo(), false, false, 0); 9315} 9316 9317SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 9318 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 9319 MFI->setFrameAddressIsTaken(true); 9320 9321 EVT VT = Op.getValueType(); 9322 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 9323 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 9324 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 9325 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 9326 while (Depth--) 9327 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 9328 MachinePointerInfo(), 9329 false, false, 0); 9330 return FrameAddr; 9331} 9332 9333SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 9334 SelectionDAG &DAG) const { 9335 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 9336} 9337 9338SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 9339 MachineFunction &MF = DAG.getMachineFunction(); 9340 SDValue Chain = Op.getOperand(0); 9341 SDValue Offset = Op.getOperand(1); 9342 SDValue Handler = Op.getOperand(2); 9343 DebugLoc dl = Op.getDebugLoc(); 9344 9345 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 9346 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 9347 getPointerTy()); 9348 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 9349 9350 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 9351 DAG.getIntPtrConstant(TD->getPointerSize())); 9352 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 9353 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 9354 false, false, 0); 9355 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 9356 MF.getRegInfo().addLiveOut(StoreAddrReg); 9357 9358 return DAG.getNode(X86ISD::EH_RETURN, dl, 9359 MVT::Other, 9360 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 9361} 9362 9363SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 9364 SelectionDAG &DAG) const { 9365 SDValue Root = Op.getOperand(0); 9366 SDValue Trmp = Op.getOperand(1); // trampoline 9367 SDValue FPtr = Op.getOperand(2); // nested function 9368 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 9369 DebugLoc dl = Op.getDebugLoc(); 9370 9371 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 9372 9373 if (Subtarget->is64Bit()) { 9374 SDValue OutChains[6]; 9375 9376 // Large code-model. 9377 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 9378 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 9379 9380 const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10); 9381 const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11); 9382 9383 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 9384 9385 // Load the pointer to the nested function into R11. 9386 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 9387 SDValue Addr = Trmp; 9388 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 9389 Addr, MachinePointerInfo(TrmpAddr), 9390 false, false, 0); 9391 9392 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9393 DAG.getConstant(2, MVT::i64)); 9394 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 9395 MachinePointerInfo(TrmpAddr, 2), 9396 false, false, 2); 9397 9398 // Load the 'nest' parameter value into R10. 9399 // R10 is specified in X86CallingConv.td 9400 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 9401 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9402 DAG.getConstant(10, MVT::i64)); 9403 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 9404 Addr, MachinePointerInfo(TrmpAddr, 10), 9405 false, false, 0); 9406 9407 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9408 DAG.getConstant(12, MVT::i64)); 9409 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 9410 MachinePointerInfo(TrmpAddr, 12), 9411 false, false, 2); 9412 9413 // Jump to the nested function. 9414 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
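    // Summary added for clarity (byte values follow from the constants above):
    // the 64-bit trampoline assembled here is 23 bytes,
    //   0:  49 BB <FPtr:8>   movabsq $<nested fn>, %r11
    //   10: 49 BA <Nest:8>   movabsq $<nest value>, %r10
    //   20: 49 FF E3         jmpq   *%r11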
9415 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9416 DAG.getConstant(20, MVT::i64)); 9417 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 9418 Addr, MachinePointerInfo(TrmpAddr, 20), 9419 false, false, 0); 9420 9421 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 9422 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9423 DAG.getConstant(22, MVT::i64)); 9424 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 9425 MachinePointerInfo(TrmpAddr, 22), 9426 false, false, 0); 9427 9428 SDValue Ops[] = 9429 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 9430 return DAG.getMergeValues(Ops, 2, dl); 9431 } else { 9432 const Function *Func = 9433 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 9434 CallingConv::ID CC = Func->getCallingConv(); 9435 unsigned NestReg; 9436 9437 switch (CC) { 9438 default: 9439 llvm_unreachable("Unsupported calling convention"); 9440 case CallingConv::C: 9441 case CallingConv::X86_StdCall: { 9442 // Pass 'nest' parameter in ECX. 9443 // Must be kept in sync with X86CallingConv.td 9444 NestReg = X86::ECX; 9445 9446 // Check that ECX wasn't needed by an 'inreg' parameter. 9447 FunctionType *FTy = Func->getFunctionType(); 9448 const AttrListPtr &Attrs = Func->getAttributes(); 9449 9450 if (!Attrs.isEmpty() && !Func->isVarArg()) { 9451 unsigned InRegCount = 0; 9452 unsigned Idx = 1; 9453 9454 for (FunctionType::param_iterator I = FTy->param_begin(), 9455 E = FTy->param_end(); I != E; ++I, ++Idx) 9456 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 9457 // FIXME: should only count parameters that are lowered to integers. 9458 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 9459 9460 if (InRegCount > 2) { 9461 report_fatal_error("Nest register in use - reduce number of inreg" 9462 " parameters!"); 9463 } 9464 } 9465 break; 9466 } 9467 case CallingConv::X86_FastCall: 9468 case CallingConv::X86_ThisCall: 9469 case CallingConv::Fast: 9470 // Pass 'nest' parameter in EAX. 9471 // Must be kept in sync with X86CallingConv.td 9472 NestReg = X86::EAX; 9473 break; 9474 } 9475 9476 SDValue OutChains[4]; 9477 SDValue Addr, Disp; 9478 9479 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9480 DAG.getConstant(10, MVT::i32)); 9481 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 9482 9483 // This is storing the opcode for MOV32ri. 9484 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 9485 const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg); 9486 OutChains[0] = DAG.getStore(Root, dl, 9487 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 9488 Trmp, MachinePointerInfo(TrmpAddr), 9489 false, false, 0); 9490 9491 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9492 DAG.getConstant(1, MVT::i32)); 9493 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 9494 MachinePointerInfo(TrmpAddr, 1), 9495 false, false, 1); 9496 9497 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
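    // Illustrative summary: the resulting 32-bit trampoline is 10 bytes,
    //   0: B8+reg <Nest:4>   movl $<nest value>, %eax or %ecx (NestReg above)
    //   5: E9     <disp:4>   jmp  <nested fn>, disp = FPtr - (Trmp + 10)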
9498 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9499 DAG.getConstant(5, MVT::i32)); 9500 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 9501 MachinePointerInfo(TrmpAddr, 5), 9502 false, false, 1); 9503 9504 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9505 DAG.getConstant(6, MVT::i32)); 9506 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 9507 MachinePointerInfo(TrmpAddr, 6), 9508 false, false, 1); 9509 9510 SDValue Ops[] = 9511 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 9512 return DAG.getMergeValues(Ops, 2, dl); 9513 } 9514} 9515 9516SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 9517 SelectionDAG &DAG) const { 9518 /* 9519 The rounding mode is in bits 11:10 of FPSR, and has the following 9520 settings: 9521 00 Round to nearest 9522 01 Round to -inf 9523 10 Round to +inf 9524 11 Round to 0 9525 9526 FLT_ROUNDS, on the other hand, expects the following: 9527 -1 Undefined 9528 0 Round to 0 9529 1 Round to nearest 9530 2 Round to +inf 9531 3 Round to -inf 9532 9533 To perform the conversion, we do: 9534 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 9535 */ 9536 9537 MachineFunction &MF = DAG.getMachineFunction(); 9538 const TargetMachine &TM = MF.getTarget(); 9539 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 9540 unsigned StackAlignment = TFI.getStackAlignment(); 9541 EVT VT = Op.getValueType(); 9542 DebugLoc DL = Op.getDebugLoc(); 9543 9544 // Save FP Control Word to stack slot 9545 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 9546 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 9547 9548 9549 MachineMemOperand *MMO = 9550 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 9551 MachineMemOperand::MOStore, 2, 2); 9552 9553 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 9554 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 9555 DAG.getVTList(MVT::Other), 9556 Ops, 2, MVT::i16, MMO); 9557 9558 // Load FP Control Word from stack slot 9559 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 9560 MachinePointerInfo(), false, false, 0); 9561 9562 // Transform as necessary 9563 SDValue CWD1 = 9564 DAG.getNode(ISD::SRL, DL, MVT::i16, 9565 DAG.getNode(ISD::AND, DL, MVT::i16, 9566 CWD, DAG.getConstant(0x800, MVT::i16)), 9567 DAG.getConstant(11, MVT::i8)); 9568 SDValue CWD2 = 9569 DAG.getNode(ISD::SRL, DL, MVT::i16, 9570 DAG.getNode(ISD::AND, DL, MVT::i16, 9571 CWD, DAG.getConstant(0x400, MVT::i16)), 9572 DAG.getConstant(9, MVT::i8)); 9573 9574 SDValue RetVal = 9575 DAG.getNode(ISD::AND, DL, MVT::i16, 9576 DAG.getNode(ISD::ADD, DL, MVT::i16, 9577 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 9578 DAG.getConstant(1, MVT::i16)), 9579 DAG.getConstant(3, MVT::i16)); 9580 9581 9582 return DAG.getNode((VT.getSizeInBits() < 16 ? 9583 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 9584} 9585 9586SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 9587 EVT VT = Op.getValueType(); 9588 EVT OpVT = VT; 9589 unsigned NumBits = VT.getSizeInBits(); 9590 DebugLoc dl = Op.getDebugLoc(); 9591 9592 Op = Op.getOperand(0); 9593 if (VT == MVT::i8) { 9594 // Zero extend to i32 since there is not an i8 bsr. 9595 OpVT = MVT::i32; 9596 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 9597 } 9598 9599 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 9600 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 9601 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 9602 9603 // If src is zero (i.e. 
bsr sets ZF), returns NumBits. 9604 SDValue Ops[] = { 9605 Op, 9606 DAG.getConstant(NumBits+NumBits-1, OpVT), 9607 DAG.getConstant(X86::COND_E, MVT::i8), 9608 Op.getValue(1) 9609 }; 9610 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 9611 9612 // Finally xor with NumBits-1. 9613 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 9614 9615 if (VT == MVT::i8) 9616 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 9617 return Op; 9618} 9619 9620SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 9621 EVT VT = Op.getValueType(); 9622 EVT OpVT = VT; 9623 unsigned NumBits = VT.getSizeInBits(); 9624 DebugLoc dl = Op.getDebugLoc(); 9625 9626 Op = Op.getOperand(0); 9627 if (VT == MVT::i8) { 9628 OpVT = MVT::i32; 9629 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 9630 } 9631 9632 // Issue a bsf (scan bits forward) which also sets EFLAGS. 9633 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 9634 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 9635 9636 // If src is zero (i.e. bsf sets ZF), returns NumBits. 9637 SDValue Ops[] = { 9638 Op, 9639 DAG.getConstant(NumBits, OpVT), 9640 DAG.getConstant(X86::COND_E, MVT::i8), 9641 Op.getValue(1) 9642 }; 9643 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 9644 9645 if (VT == MVT::i8) 9646 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 9647 return Op; 9648} 9649 9650// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit 9651// ones, and then concatenate the result back. 9652static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { 9653 EVT VT = Op.getValueType(); 9654 9655 assert(VT.getSizeInBits() == 256 && VT.isInteger() && 9656 "Unsupported value type for operation"); 9657 9658 int NumElems = VT.getVectorNumElements(); 9659 DebugLoc dl = Op.getDebugLoc(); 9660 SDValue Idx0 = DAG.getConstant(0, MVT::i32); 9661 SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); 9662 9663 // Extract the LHS vectors 9664 SDValue LHS = Op.getOperand(0); 9665 SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); 9666 SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); 9667 9668 // Extract the RHS vectors 9669 SDValue RHS = Op.getOperand(1); 9670 SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl); 9671 SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl); 9672 9673 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 9674 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 9675 9676 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 9677 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), 9678 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); 9679} 9680 9681SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const { 9682 assert(Op.getValueType().getSizeInBits() == 256 && 9683 Op.getValueType().isInteger() && 9684 "Only handle AVX 256-bit vector integer operation"); 9685 return Lower256IntArith(Op, DAG); 9686} 9687 9688SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const { 9689 assert(Op.getValueType().getSizeInBits() == 256 && 9690 Op.getValueType().isInteger() && 9691 "Only handle AVX 256-bit vector integer operation"); 9692 return Lower256IntArith(Op, DAG); 9693} 9694 9695SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { 9696 EVT VT = Op.getValueType(); 9697 9698 // Decompose 256-bit ops into smaller 128-bit ops. 
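  // Explanatory note: for the remaining v2i64 case (sketched in the comments
  // further down), SSE2 has no 64x64->64 multiply, so each lane is assembled
  // from pmuludq (32x32->64) partial products. Writing a = Ahi*2^32 + Alo and
  // b = Bhi*2^32 + Blo, the low 64 bits of a*b are
  //   Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32),
  // which is the AloBlo/AloBhi/AhiBlo sum computed below.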
9699 if (VT.getSizeInBits() == 256) 9700 return Lower256IntArith(Op, DAG); 9701 9702 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 9703 DebugLoc dl = Op.getDebugLoc(); 9704 9705 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 9706 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 9707 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 9708 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 9709 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 9710 // 9711 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 9712 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 9713 // return AloBlo + AloBhi + AhiBlo; 9714 9715 SDValue A = Op.getOperand(0); 9716 SDValue B = Op.getOperand(1); 9717 9718 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9719 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9720 A, DAG.getConstant(32, MVT::i32)); 9721 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9722 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9723 B, DAG.getConstant(32, MVT::i32)); 9724 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9725 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 9726 A, B); 9727 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9728 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 9729 A, Bhi); 9730 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9731 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 9732 Ahi, B); 9733 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9734 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9735 AloBhi, DAG.getConstant(32, MVT::i32)); 9736 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9737 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9738 AhiBlo, DAG.getConstant(32, MVT::i32)); 9739 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 9740 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 9741 return Res; 9742} 9743 9744SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 9745 9746 EVT VT = Op.getValueType(); 9747 DebugLoc dl = Op.getDebugLoc(); 9748 SDValue R = Op.getOperand(0); 9749 SDValue Amt = Op.getOperand(1); 9750 LLVMContext *Context = DAG.getContext(); 9751 9752 if (!(Subtarget->hasSSE2() || Subtarget->hasAVX())) 9753 return SDValue(); 9754 9755 // Decompose 256-bit shifts into smaller 128-bit shifts. 
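  // (AVX before AVX2 provides no 256-bit integer shifts, so the operand and,
  // when it is non-constant, the shift-amount vector are split into 128-bit
  // halves, shifted separately, and concatenated back together.)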
9756 if (VT.getSizeInBits() == 256) { 9757 int NumElems = VT.getVectorNumElements(); 9758 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 9759 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 9760 9761 // Extract the two vectors 9762 SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl); 9763 SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32), 9764 DAG, dl); 9765 9766 // Recreate the shift amount vectors 9767 SDValue Amt1, Amt2; 9768 if (Amt.getOpcode() == ISD::BUILD_VECTOR) { 9769 // Constant shift amount 9770 SmallVector<SDValue, 4> Amt1Csts; 9771 SmallVector<SDValue, 4> Amt2Csts; 9772 for (int i = 0; i < NumElems/2; ++i) 9773 Amt1Csts.push_back(Amt->getOperand(i)); 9774 for (int i = NumElems/2; i < NumElems; ++i) 9775 Amt2Csts.push_back(Amt->getOperand(i)); 9776 9777 Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, 9778 &Amt1Csts[0], NumElems/2); 9779 Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, 9780 &Amt2Csts[0], NumElems/2); 9781 } else { 9782 // Variable shift amount 9783 Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl); 9784 Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32), 9785 DAG, dl); 9786 } 9787 9788 // Issue new vector shifts for the smaller types 9789 V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); 9790 V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); 9791 9792 // Concatenate the result back 9793 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); 9794 } 9795 9796 // Optimize shl/srl/sra with constant shift amount. 9797 if (isSplatVector(Amt.getNode())) { 9798 SDValue SclrAmt = Amt->getOperand(0); 9799 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { 9800 uint64_t ShiftAmt = C->getZExtValue(); 9801 9802 if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL) 9803 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9804 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9805 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9806 9807 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL) 9808 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9809 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9810 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9811 9812 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL) 9813 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9814 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9815 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9816 9817 if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL) 9818 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9819 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9820 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9821 9822 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL) 9823 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9824 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 9825 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9826 9827 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL) 9828 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9829 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 9830 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9831 9832 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA) 9833 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9834 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 9835 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9836 9837 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA) 9838 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9839 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 9840 R, 
DAG.getConstant(ShiftAmt, MVT::i32)); 9841 } 9842 } 9843 9844 // Lower SHL with variable shift amount. 9845 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { 9846 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9847 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9848 Op.getOperand(1), DAG.getConstant(23, MVT::i32)); 9849 9850 ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); 9851 9852 std::vector<Constant*> CV(4, CI); 9853 Constant *C = ConstantVector::get(CV); 9854 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 9855 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 9856 MachinePointerInfo::getConstantPool(), 9857 false, false, 16); 9858 9859 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 9860 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); 9861 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 9862 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 9863 } 9864 if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { 9865 // a = a << 5; 9866 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9867 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9868 Op.getOperand(1), DAG.getConstant(5, MVT::i32)); 9869 9870 ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); 9871 ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); 9872 9873 std::vector<Constant*> CVM1(16, CM1); 9874 std::vector<Constant*> CVM2(16, CM2); 9875 Constant *C = ConstantVector::get(CVM1); 9876 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 9877 SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 9878 MachinePointerInfo::getConstantPool(), 9879 false, false, 16); 9880 9881 // r = pblendv(r, psllw(r & (char16)15, 4), a); 9882 M = DAG.getNode(ISD::AND, dl, VT, R, M); 9883 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9884 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 9885 DAG.getConstant(4, MVT::i32)); 9886 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op); 9887 // a += a 9888 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 9889 9890 C = ConstantVector::get(CVM2); 9891 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 9892 M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 9893 MachinePointerInfo::getConstantPool(), 9894 false, false, 16); 9895 9896 // r = pblendv(r, psllw(r & (char16)63, 2), a); 9897 M = DAG.getNode(ISD::AND, dl, VT, R, M); 9898 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9899 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 9900 DAG.getConstant(2, MVT::i32)); 9901 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op); 9902 // a += a 9903 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 9904 9905 // return pblendv(r, r+r, a); 9906 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, 9907 R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); 9908 return R; 9909 } 9910 return SDValue(); 9911} 9912 9913SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 9914 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 9915 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 9916 // looks for this combo and may remove the "setcc" instruction if the "setcc" 9917 // has only one use. 9918 SDNode *N = Op.getNode(); 9919 SDValue LHS = N->getOperand(0); 9920 SDValue RHS = N->getOperand(1); 9921 unsigned BaseOp = 0; 9922 unsigned Cond = 0; 9923 DebugLoc DL = Op.getDebugLoc(); 9924 switch (Op.getOpcode()) { 9925 default: llvm_unreachable("Unknown ovf instruction!"); 9926 case ISD::SADDO: 9927 // An add of one will be selected as an INC.
Note that INC doesn't 9928 // set CF, so we can't do this for UADDO. 9929 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 9930 if (C->isOne()) { 9931 BaseOp = X86ISD::INC; 9932 Cond = X86::COND_O; 9933 break; 9934 } 9935 BaseOp = X86ISD::ADD; 9936 Cond = X86::COND_O; 9937 break; 9938 case ISD::UADDO: 9939 BaseOp = X86ISD::ADD; 9940 Cond = X86::COND_B; 9941 break; 9942 case ISD::SSUBO: 9943 // A subtract of one will be selected as a DEC. Note that DEC doesn't 9944 // set CF, so we can't do this for USUBO. 9945 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 9946 if (C->isOne()) { 9947 BaseOp = X86ISD::DEC; 9948 Cond = X86::COND_O; 9949 break; 9950 } 9951 BaseOp = X86ISD::SUB; 9952 Cond = X86::COND_O; 9953 break; 9954 case ISD::USUBO: 9955 BaseOp = X86ISD::SUB; 9956 Cond = X86::COND_B; 9957 break; 9958 case ISD::SMULO: 9959 BaseOp = X86ISD::SMUL; 9960 Cond = X86::COND_O; 9961 break; 9962 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 9963 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 9964 MVT::i32); 9965 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 9966 9967 SDValue SetCC = 9968 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9969 DAG.getConstant(X86::COND_O, MVT::i32), 9970 SDValue(Sum.getNode(), 2)); 9971 9972 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 9973 } 9974 } 9975 9976 // Also sets EFLAGS. 9977 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 9978 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 9979 9980 SDValue SetCC = 9981 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), 9982 DAG.getConstant(Cond, MVT::i32), 9983 SDValue(Sum.getNode(), 1)); 9984 9985 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 9986} 9987 9988SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{ 9989 DebugLoc dl = Op.getDebugLoc(); 9990 SDNode* Node = Op.getNode(); 9991 EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT(); 9992 EVT VT = Node->getValueType(0); 9993 9994 if (Subtarget->hasSSE2() && VT.isVector()) { 9995 unsigned BitsDiff = VT.getScalarType().getSizeInBits() - 9996 ExtraVT.getScalarType().getSizeInBits(); 9997 SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); 9998 9999 unsigned SHLIntrinsicsID = 0; 10000 unsigned SRAIntrinsicsID = 0; 10001 switch (VT.getSimpleVT().SimpleTy) { 10002 default: 10003 return SDValue(); 10004 case MVT::v2i64: { 10005 SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_q; 10006 SRAIntrinsicsID = 0; 10007 break; 10008 } 10009 case MVT::v4i32: { 10010 SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d; 10011 SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d; 10012 break; 10013 } 10014 case MVT::v8i16: { 10015 SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_w; 10016 SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w; 10017 break; 10018 } 10019 } 10020 10021 SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10022 DAG.getConstant(SHLIntrinsicsID, MVT::i32), 10023 Node->getOperand(0), ShAmt); 10024 10025 // In case of 1 bit sext, no need to shr 10026 if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1; 10027 10028 if (SRAIntrinsicsID) { 10029 Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10030 DAG.getConstant(SRAIntrinsicsID, MVT::i32), 10031 Tmp1, ShAmt); 10032 } 10033 return Tmp1; 10034 } 10035 10036 return SDValue(); 10037} 10038 10039 10040SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ 10041 DebugLoc dl = Op.getDebugLoc(); 10042 10043 // Go ahead and emit 
the fence on x86-64 even if we asked for no-sse2. 10044 // There isn't any reason to disable it if the target processor supports it. 10045 if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) { 10046 SDValue Chain = Op.getOperand(0); 10047 SDValue Zero = DAG.getConstant(0, MVT::i32); 10048 SDValue Ops[] = { 10049 DAG.getRegister(X86::ESP, MVT::i32), // Base 10050 DAG.getTargetConstant(1, MVT::i8), // Scale 10051 DAG.getRegister(0, MVT::i32), // Index 10052 DAG.getTargetConstant(0, MVT::i32), // Disp 10053 DAG.getRegister(0, MVT::i32), // Segment. 10054 Zero, 10055 Chain 10056 }; 10057 SDNode *Res = 10058 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 10059 array_lengthof(Ops)); 10060 return SDValue(Res, 0); 10061 } 10062 10063 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 10064 if (!isDev) 10065 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 10066 10067 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 10068 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 10069 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 10070 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 10071 10072 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 10073 if (!Op1 && !Op2 && !Op3 && Op4) 10074 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 10075 10076 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 10077 if (Op1 && !Op2 && !Op3 && !Op4) 10078 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 10079 10080 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 10081 // (MFENCE)>; 10082 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 10083} 10084 10085SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op, 10086 SelectionDAG &DAG) const { 10087 DebugLoc dl = Op.getDebugLoc(); 10088 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( 10089 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); 10090 SynchronizationScope FenceScope = static_cast<SynchronizationScope>( 10091 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 10092 10093 // The only fence that needs an instruction is a sequentially-consistent 10094 // cross-thread fence. 10095 if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { 10096 // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for 10097 // no-sse2). There isn't any reason to disable it if the target processor 10098 // supports it. 10099 if (Subtarget->hasSSE2() || Subtarget->is64Bit()) 10100 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 10101 10102 SDValue Chain = Op.getOperand(0); 10103 SDValue Zero = DAG.getConstant(0, MVT::i32); 10104 SDValue Ops[] = { 10105 DAG.getRegister(X86::ESP, MVT::i32), // Base 10106 DAG.getTargetConstant(1, MVT::i8), // Scale 10107 DAG.getRegister(0, MVT::i32), // Index 10108 DAG.getTargetConstant(0, MVT::i32), // Disp 10109 DAG.getRegister(0, MVT::i32), // Segment. 10110 Zero, 10111 Chain 10112 }; 10113 SDNode *Res = 10114 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 10115 array_lengthof(Ops)); 10116 return SDValue(Res, 0); 10117 } 10118 10119 // MEMBARRIER is a compiler barrier; it codegens to a no-op. 
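  // (Weaker fences can be dropped here because x86's memory model only permits
  //  stores to be reordered past later loads; preventing that reordering is
  //  exactly what the seq_cst mfence / locked-or path above is for.)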
10120 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 10121} 10122 10123 10124SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 10125 EVT T = Op.getValueType(); 10126 DebugLoc DL = Op.getDebugLoc(); 10127 unsigned Reg = 0; 10128 unsigned size = 0; 10129 switch(T.getSimpleVT().SimpleTy) { 10130 default: 10131 assert(false && "Invalid value type!"); 10132 case MVT::i8: Reg = X86::AL; size = 1; break; 10133 case MVT::i16: Reg = X86::AX; size = 2; break; 10134 case MVT::i32: Reg = X86::EAX; size = 4; break; 10135 case MVT::i64: 10136 assert(Subtarget->is64Bit() && "Node not type legal!"); 10137 Reg = X86::RAX; size = 8; 10138 break; 10139 } 10140 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 10141 Op.getOperand(2), SDValue()); 10142 SDValue Ops[] = { cpIn.getValue(0), 10143 Op.getOperand(1), 10144 Op.getOperand(3), 10145 DAG.getTargetConstant(size, MVT::i8), 10146 cpIn.getValue(1) }; 10147 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 10148 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 10149 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 10150 Ops, 5, T, MMO); 10151 SDValue cpOut = 10152 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 10153 return cpOut; 10154} 10155 10156SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 10157 SelectionDAG &DAG) const { 10158 assert(Subtarget->is64Bit() && "Result not type legalized?"); 10159 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 10160 SDValue TheChain = Op.getOperand(0); 10161 DebugLoc dl = Op.getDebugLoc(); 10162 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 10163 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 10164 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 10165 rax.getValue(2)); 10166 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 10167 DAG.getConstant(32, MVT::i8)); 10168 SDValue Ops[] = { 10169 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 10170 rdx.getValue(1) 10171 }; 10172 return DAG.getMergeValues(Ops, 2, dl); 10173} 10174 10175SDValue X86TargetLowering::LowerBITCAST(SDValue Op, 10176 SelectionDAG &DAG) const { 10177 EVT SrcVT = Op.getOperand(0).getValueType(); 10178 EVT DstVT = Op.getValueType(); 10179 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && 10180 Subtarget->hasMMX() && "Unexpected custom BITCAST"); 10181 assert((DstVT == MVT::i64 || 10182 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 10183 "Unexpected custom BITCAST"); 10184 // i64 <=> MMX conversions are Legal. 10185 if (SrcVT==MVT::i64 && DstVT.isVector()) 10186 return Op; 10187 if (DstVT==MVT::i64 && SrcVT.isVector()) 10188 return Op; 10189 // MMX <=> MMX conversions are Legal. 10190 if (SrcVT.isVector() && DstVT.isVector()) 10191 return Op; 10192 // All other conversions need to be expanded. 
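  // (Explanatory note: returning a null SDValue from a custom lowering hook
  //  lets the legalizer fall back to its generic expansion, which for BITCAST
  //  goes through a stack temporary.)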
10193 return SDValue(); 10194} 10195 10196SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 10197 SDNode *Node = Op.getNode(); 10198 DebugLoc dl = Node->getDebugLoc(); 10199 EVT T = Node->getValueType(0); 10200 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 10201 DAG.getConstant(0, T), Node->getOperand(2)); 10202 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 10203 cast<AtomicSDNode>(Node)->getMemoryVT(), 10204 Node->getOperand(0), 10205 Node->getOperand(1), negOp, 10206 cast<AtomicSDNode>(Node)->getSrcValue(), 10207 cast<AtomicSDNode>(Node)->getAlignment(), 10208 cast<AtomicSDNode>(Node)->getOrdering(), 10209 cast<AtomicSDNode>(Node)->getSynchScope()); 10210} 10211 10212static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { 10213 SDNode *Node = Op.getNode(); 10214 DebugLoc dl = Node->getDebugLoc(); 10215 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 10216 10217 // Convert seq_cst store -> xchg 10218 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) 10219 // FIXME: On 32-bit, store -> fist or movq would be more efficient 10220 // (The only way to get a 16-byte store is cmpxchg16b) 10221 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. 10222 if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent || 10223 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 10224 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, 10225 cast<AtomicSDNode>(Node)->getMemoryVT(), 10226 Node->getOperand(0), 10227 Node->getOperand(1), Node->getOperand(2), 10228 cast<AtomicSDNode>(Node)->getMemOperand(), 10229 cast<AtomicSDNode>(Node)->getOrdering(), 10230 cast<AtomicSDNode>(Node)->getSynchScope()); 10231 return Swap.getValue(1); 10232 } 10233 // Other atomic stores have a simple pattern. 10234 return Op; 10235} 10236 10237static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 10238 EVT VT = Op.getNode()->getValueType(0); 10239 10240 // Let legalize expand this if it isn't a legal type yet. 10241 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 10242 return SDValue(); 10243 10244 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 10245 10246 unsigned Opc; 10247 bool ExtraOp = false; 10248 switch (Op.getOpcode()) { 10249 default: assert(0 && "Invalid code"); 10250 case ISD::ADDC: Opc = X86ISD::ADD; break; 10251 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; 10252 case ISD::SUBC: Opc = X86ISD::SUB; break; 10253 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; 10254 } 10255 10256 if (!ExtraOp) 10257 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 10258 Op.getOperand(1)); 10259 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 10260 Op.getOperand(1), Op.getOperand(2)); 10261} 10262 10263/// LowerOperation - Provide custom lowering hooks for some operations. 
10264/// 10265SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 10266 switch (Op.getOpcode()) { 10267 default: llvm_unreachable("Should not custom lower this!"); 10268 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); 10269 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 10270 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op,DAG); 10271 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 10272 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 10273 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); 10274 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 10275 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 10276 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 10277 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 10278 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 10279 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); 10280 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); 10281 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 10282 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 10283 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 10284 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 10285 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 10286 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 10287 case ISD::SHL_PARTS: 10288 case ISD::SRA_PARTS: 10289 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); 10290 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 10291 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 10292 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 10293 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 10294 case ISD::FABS: return LowerFABS(Op, DAG); 10295 case ISD::FNEG: return LowerFNEG(Op, DAG); 10296 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 10297 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); 10298 case ISD::SETCC: return LowerSETCC(Op, DAG); 10299 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 10300 case ISD::SELECT: return LowerSELECT(Op, DAG); 10301 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 10302 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 10303 case ISD::VASTART: return LowerVASTART(Op, DAG); 10304 case ISD::VAARG: return LowerVAARG(Op, DAG); 10305 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 10306 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 10307 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 10308 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 10309 case ISD::FRAME_TO_ARGS_OFFSET: 10310 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 10311 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 10312 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 10313 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 10314 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 10315 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 10316 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 10317 case ISD::MUL: return LowerMUL(Op, DAG); 10318 case ISD::SRA: 10319 case ISD::SRL: 10320 case ISD::SHL: return LowerShift(Op, DAG); 10321 case ISD::SADDO: 10322 case ISD::UADDO: 10323 case ISD::SSUBO: 10324 case ISD::USUBO: 10325 case ISD::SMULO: 10326 case ISD::UMULO: return LowerXALUO(Op, DAG); 10327 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 10328 case ISD::BITCAST: return 
LowerBITCAST(Op, DAG); 10329 case ISD::ADDC: 10330 case ISD::ADDE: 10331 case ISD::SUBC: 10332 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 10333 case ISD::ADD: return LowerADD(Op, DAG); 10334 case ISD::SUB: return LowerSUB(Op, DAG); 10335 } 10336} 10337 10338static void ReplaceATOMIC_LOAD(SDNode *Node, 10339 SmallVectorImpl<SDValue> &Results, 10340 SelectionDAG &DAG) { 10341 DebugLoc dl = Node->getDebugLoc(); 10342 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 10343 10344 // Convert wide load -> cmpxchg8b/cmpxchg16b 10345 // FIXME: On 32-bit, load -> fild or movq would be more efficient 10346 // (The only way to get a 16-byte load is cmpxchg16b) 10347 // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. 10348 SDValue Zero = DAG.getConstant(0, VT); 10349 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT, 10350 Node->getOperand(0), 10351 Node->getOperand(1), Zero, Zero, 10352 cast<AtomicSDNode>(Node)->getMemOperand(), 10353 cast<AtomicSDNode>(Node)->getOrdering(), 10354 cast<AtomicSDNode>(Node)->getSynchScope()); 10355 Results.push_back(Swap.getValue(0)); 10356 Results.push_back(Swap.getValue(1)); 10357} 10358 10359void X86TargetLowering:: 10360ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 10361 SelectionDAG &DAG, unsigned NewOp) const { 10362 EVT T = Node->getValueType(0); 10363 DebugLoc dl = Node->getDebugLoc(); 10364 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 10365 10366 SDValue Chain = Node->getOperand(0); 10367 SDValue In1 = Node->getOperand(1); 10368 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 10369 Node->getOperand(2), DAG.getIntPtrConstant(0)); 10370 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 10371 Node->getOperand(2), DAG.getIntPtrConstant(1)); 10372 SDValue Ops[] = { Chain, In1, In2L, In2H }; 10373 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 10374 SDValue Result = 10375 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 10376 cast<MemSDNode>(Node)->getMemOperand()); 10377 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 10378 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 10379 Results.push_back(Result.getValue(2)); 10380} 10381 10382/// ReplaceNodeResults - Replace a node with an illegal result type 10383/// with a new node built out of custom code. 10384void X86TargetLowering::ReplaceNodeResults(SDNode *N, 10385 SmallVectorImpl<SDValue>&Results, 10386 SelectionDAG &DAG) const { 10387 DebugLoc dl = N->getDebugLoc(); 10388 switch (N->getOpcode()) { 10389 default: 10390 assert(false && "Do not know how to custom type legalize this operation!"); 10391 return; 10392 case ISD::SIGN_EXTEND_INREG: 10393 case ISD::ADDC: 10394 case ISD::ADDE: 10395 case ISD::SUBC: 10396 case ISD::SUBE: 10397 // We don't want to expand or promote these. 10398 return; 10399 case ISD::FP_TO_SINT: { 10400 std::pair<SDValue,SDValue> Vals = 10401 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 10402 SDValue FIST = Vals.first, StackSlot = Vals.second; 10403 if (FIST.getNode() != 0) { 10404 EVT VT = N->getValueType(0); 10405 // Return a load from the stack slot. 
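      // (The X86ISD::FP_TO_INTnn_IN_MEM node built by FP_TO_INTHelper stored
      //  the converted integer to StackSlot; the load below recovers it as the
      //  result value.)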
10406 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 10407 MachinePointerInfo(), false, false, 0)); 10408 } 10409 return; 10410 } 10411 case ISD::READCYCLECOUNTER: { 10412 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 10413 SDValue TheChain = N->getOperand(0); 10414 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 10415 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 10416 rd.getValue(1)); 10417 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 10418 eax.getValue(2)); 10419 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 10420 SDValue Ops[] = { eax, edx }; 10421 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 10422 Results.push_back(edx.getValue(1)); 10423 return; 10424 } 10425 case ISD::ATOMIC_CMP_SWAP: { 10426 EVT T = N->getValueType(0); 10427 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); 10428 bool Regs64bit = T == MVT::i128; 10429 EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; 10430 SDValue cpInL, cpInH; 10431 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 10432 DAG.getConstant(0, HalfT)); 10433 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 10434 DAG.getConstant(1, HalfT)); 10435 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, 10436 Regs64bit ? X86::RAX : X86::EAX, 10437 cpInL, SDValue()); 10438 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, 10439 Regs64bit ? X86::RDX : X86::EDX, 10440 cpInH, cpInL.getValue(1)); 10441 SDValue swapInL, swapInH; 10442 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 10443 DAG.getConstant(0, HalfT)); 10444 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 10445 DAG.getConstant(1, HalfT)); 10446 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, 10447 Regs64bit ? X86::RBX : X86::EBX, 10448 swapInL, cpInH.getValue(1)); 10449 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, 10450 Regs64bit ? X86::RCX : X86::ECX, 10451 swapInH, swapInL.getValue(1)); 10452 SDValue Ops[] = { swapInH.getValue(0), 10453 N->getOperand(1), 10454 swapInH.getValue(1) }; 10455 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 10456 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 10457 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : 10458 X86ISD::LCMPXCHG8_DAG; 10459 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, 10460 Ops, 3, T, MMO); 10461 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, 10462 Regs64bit ? X86::RAX : X86::EAX, 10463 HalfT, Result.getValue(1)); 10464 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, 10465 Regs64bit ? 
X86::RDX : X86::EDX, 10466 HalfT, cpOutL.getValue(2)); 10467 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 10468 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2)); 10469 Results.push_back(cpOutH.getValue(1)); 10470 return; 10471 } 10472 case ISD::ATOMIC_LOAD_ADD: 10473 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 10474 return; 10475 case ISD::ATOMIC_LOAD_AND: 10476 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 10477 return; 10478 case ISD::ATOMIC_LOAD_NAND: 10479 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 10480 return; 10481 case ISD::ATOMIC_LOAD_OR: 10482 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 10483 return; 10484 case ISD::ATOMIC_LOAD_SUB: 10485 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 10486 return; 10487 case ISD::ATOMIC_LOAD_XOR: 10488 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 10489 return; 10490 case ISD::ATOMIC_SWAP: 10491 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 10492 return; 10493 case ISD::ATOMIC_LOAD: 10494 ReplaceATOMIC_LOAD(N, Results, DAG); 10495 } 10496} 10497 10498const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 10499 switch (Opcode) { 10500 default: return NULL; 10501 case X86ISD::BSF: return "X86ISD::BSF"; 10502 case X86ISD::BSR: return "X86ISD::BSR"; 10503 case X86ISD::SHLD: return "X86ISD::SHLD"; 10504 case X86ISD::SHRD: return "X86ISD::SHRD"; 10505 case X86ISD::FAND: return "X86ISD::FAND"; 10506 case X86ISD::FOR: return "X86ISD::FOR"; 10507 case X86ISD::FXOR: return "X86ISD::FXOR"; 10508 case X86ISD::FSRL: return "X86ISD::FSRL"; 10509 case X86ISD::FILD: return "X86ISD::FILD"; 10510 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 10511 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 10512 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 10513 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 10514 case X86ISD::FLD: return "X86ISD::FLD"; 10515 case X86ISD::FST: return "X86ISD::FST"; 10516 case X86ISD::CALL: return "X86ISD::CALL"; 10517 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 10518 case X86ISD::BT: return "X86ISD::BT"; 10519 case X86ISD::CMP: return "X86ISD::CMP"; 10520 case X86ISD::COMI: return "X86ISD::COMI"; 10521 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 10522 case X86ISD::SETCC: return "X86ISD::SETCC"; 10523 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 10524 case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd"; 10525 case X86ISD::FSETCCss: return "X86ISD::FSETCCss"; 10526 case X86ISD::CMOV: return "X86ISD::CMOV"; 10527 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 10528 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 10529 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 10530 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 10531 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 10532 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 10533 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 10534 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 10535 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 10536 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 10537 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 10538 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 10539 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 10540 case X86ISD::ANDNP: return "X86ISD::ANDNP"; 10541 case X86ISD::PSIGNB: return "X86ISD::PSIGNB"; 10542 case X86ISD::PSIGNW: return "X86ISD::PSIGNW"; 10543 
case X86ISD::PSIGND: return "X86ISD::PSIGND"; 10544 case X86ISD::PBLENDVB: return "X86ISD::PBLENDVB"; 10545 case X86ISD::FMAX: return "X86ISD::FMAX"; 10546 case X86ISD::FMIN: return "X86ISD::FMIN"; 10547 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 10548 case X86ISD::FRCP: return "X86ISD::FRCP"; 10549 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 10550 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 10551 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 10552 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 10553 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 10554 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 10555 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 10556 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 10557 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 10558 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 10559 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 10560 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 10561 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 10562 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 10563 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 10564 case X86ISD::VSHL: return "X86ISD::VSHL"; 10565 case X86ISD::VSRL: return "X86ISD::VSRL"; 10566 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 10567 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 10568 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 10569 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 10570 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 10571 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 10572 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 10573 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 10574 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 10575 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 10576 case X86ISD::ADD: return "X86ISD::ADD"; 10577 case X86ISD::SUB: return "X86ISD::SUB"; 10578 case X86ISD::ADC: return "X86ISD::ADC"; 10579 case X86ISD::SBB: return "X86ISD::SBB"; 10580 case X86ISD::SMUL: return "X86ISD::SMUL"; 10581 case X86ISD::UMUL: return "X86ISD::UMUL"; 10582 case X86ISD::INC: return "X86ISD::INC"; 10583 case X86ISD::DEC: return "X86ISD::DEC"; 10584 case X86ISD::OR: return "X86ISD::OR"; 10585 case X86ISD::XOR: return "X86ISD::XOR"; 10586 case X86ISD::AND: return "X86ISD::AND"; 10587 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 10588 case X86ISD::PTEST: return "X86ISD::PTEST"; 10589 case X86ISD::TESTP: return "X86ISD::TESTP"; 10590 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 10591 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 10592 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 10593 case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; 10594 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 10595 case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; 10596 case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; 10597 case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; 10598 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 10599 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 10600 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 10601 case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; 10602 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 10603 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 10604 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 10605 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 10606 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 10607 case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; 10608 case X86ISD::MOVSLDUP_LD: return 
"X86ISD::MOVSLDUP_LD"; 10609 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 10610 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 10611 case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; 10612 case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; 10613 case X86ISD::VUNPCKLPDY: return "X86ISD::VUNPCKLPDY"; 10614 case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; 10615 case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; 10616 case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; 10617 case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; 10618 case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; 10619 case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; 10620 case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; 10621 case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; 10622 case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; 10623 case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; 10624 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; 10625 case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS"; 10626 case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY"; 10627 case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD"; 10628 case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY"; 10629 case X86ISD::VPERM2F128: return "X86ISD::VPERM2F128"; 10630 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 10631 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 10632 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 10633 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; 10634 } 10635} 10636 10637// isLegalAddressingMode - Return true if the addressing mode represented 10638// by AM is legal for this target, for a load/store of the specified type. 10639bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 10640 Type *Ty) const { 10641 // X86 supports extremely general addressing modes. 10642 CodeModel::Model M = getTargetMachine().getCodeModel(); 10643 Reloc::Model R = getTargetMachine().getRelocationModel(); 10644 10645 // X86 allows a sign-extended 32-bit immediate field as a displacement. 10646 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 10647 return false; 10648 10649 if (AM.BaseGV) { 10650 unsigned GVFlags = 10651 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 10652 10653 // If a reference to this global requires an extra load, we can't fold it. 10654 if (isGlobalStubReference(GVFlags)) 10655 return false; 10656 10657 // If BaseGV requires a register for the PIC base, we cannot also have a 10658 // BaseReg specified. 10659 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 10660 return false; 10661 10662 // If lower 4G is not available, then we must use rip-relative addressing. 10663 if ((M != CodeModel::Small || R != Reloc::Static) && 10664 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 10665 return false; 10666 } 10667 10668 switch (AM.Scale) { 10669 case 0: 10670 case 1: 10671 case 2: 10672 case 4: 10673 case 8: 10674 // These scales always work. 10675 break; 10676 case 3: 10677 case 5: 10678 case 9: 10679 // These scales are formed with basereg+scalereg. Only accept if there is 10680 // no basereg yet. 10681 if (AM.HasBaseReg) 10682 return false; 10683 break; 10684 default: // Other stuff never works. 
10685 return false; 10686 } 10687 10688 return true; 10689} 10690 10691 10692bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 10693 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 10694 return false; 10695 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 10696 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 10697 if (NumBits1 <= NumBits2) 10698 return false; 10699 return true; 10700} 10701 10702bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 10703 if (!VT1.isInteger() || !VT2.isInteger()) 10704 return false; 10705 unsigned NumBits1 = VT1.getSizeInBits(); 10706 unsigned NumBits2 = VT2.getSizeInBits(); 10707 if (NumBits1 <= NumBits2) 10708 return false; 10709 return true; 10710} 10711 10712bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { 10713 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 10714 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 10715} 10716 10717bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 10718 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 10719 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 10720} 10721 10722bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 10723 // i16 instructions are longer (0x66 prefix) and potentially slower. 10724 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 10725} 10726 10727/// isShuffleMaskLegal - Targets can use this to indicate that they only 10728/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 10729/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 10730/// are assumed to be legal. 10731bool 10732X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 10733 EVT VT) const { 10734 // Very little shuffling can be done for 64-bit vectors right now. 10735 if (VT.getSizeInBits() == 64) 10736 return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); 10737 10738 // FIXME: pshufb, blends, shifts. 10739 return (VT.getVectorNumElements() == 2 || 10740 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 10741 isMOVLMask(M, VT) || 10742 isSHUFPMask(M, VT) || 10743 isPSHUFDMask(M, VT) || 10744 isPSHUFHWMask(M, VT) || 10745 isPSHUFLWMask(M, VT) || 10746 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 10747 isUNPCKLMask(M, VT) || 10748 isUNPCKHMask(M, VT) || 10749 isUNPCKL_v_undef_Mask(M, VT) || 10750 isUNPCKH_v_undef_Mask(M, VT)); 10751} 10752 10753bool 10754X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 10755 EVT VT) const { 10756 unsigned NumElts = VT.getVectorNumElements(); 10757 // FIXME: This collection of masks seems suspect. 
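  // Note: these "clear" masks usually come from a DAG combine that rewrites an
  // AND with a constant build_vector as a shuffle with a zero vector, so only
  // patterns the 128-bit MOVL/SHUFP family can match cheaply are accepted here.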
10758 if (NumElts == 2) 10759 return true; 10760 if (NumElts == 4 && VT.getSizeInBits() == 128) { 10761 return (isMOVLMask(Mask, VT) || 10762 isCommutedMOVLMask(Mask, VT, true) || 10763 isSHUFPMask(Mask, VT) || 10764 isCommutedSHUFPMask(Mask, VT)); 10765 } 10766 return false; 10767} 10768 10769//===----------------------------------------------------------------------===// 10770// X86 Scheduler Hooks 10771//===----------------------------------------------------------------------===// 10772 10773// private utility function 10774MachineBasicBlock * 10775X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 10776 MachineBasicBlock *MBB, 10777 unsigned regOpc, 10778 unsigned immOpc, 10779 unsigned LoadOpc, 10780 unsigned CXchgOpc, 10781 unsigned notOpc, 10782 unsigned EAXreg, 10783 TargetRegisterClass *RC, 10784 bool invSrc) const { 10785 // For the atomic bitwise operator, we generate 10786 // thisMBB: 10787 // newMBB: 10788 // ld t1 = [bitinstr.addr] 10789 // op t2 = t1, [bitinstr.val] 10790 // mov EAX = t1 10791 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 10792 // bz newMBB 10793 // fallthrough -->nextMBB 10794 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10795 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 10796 MachineFunction::iterator MBBIter = MBB; 10797 ++MBBIter; 10798 10799 /// First build the CFG 10800 MachineFunction *F = MBB->getParent(); 10801 MachineBasicBlock *thisMBB = MBB; 10802 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 10803 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 10804 F->insert(MBBIter, newMBB); 10805 F->insert(MBBIter, nextMBB); 10806 10807 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 10808 nextMBB->splice(nextMBB->begin(), thisMBB, 10809 llvm::next(MachineBasicBlock::iterator(bInstr)), 10810 thisMBB->end()); 10811 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 10812 10813 // Update thisMBB to fall through to newMBB 10814 thisMBB->addSuccessor(newMBB); 10815 10816 // newMBB jumps to itself and fall through to nextMBB 10817 newMBB->addSuccessor(nextMBB); 10818 newMBB->addSuccessor(newMBB); 10819 10820 // Insert instructions into newMBB based on incoming instruction 10821 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && 10822 "unexpected number of operands"); 10823 DebugLoc dl = bInstr->getDebugLoc(); 10824 MachineOperand& destOper = bInstr->getOperand(0); 10825 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 10826 int numArgs = bInstr->getNumOperands() - 1; 10827 for (int i=0; i < numArgs; ++i) 10828 argOpers[i] = &bInstr->getOperand(i+1); 10829 10830 // x86 address has 4 operands: base, index, scale, and displacement 10831 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 10832 int valArgIndx = lastAddrIndx + 1; 10833 10834 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 10835 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 10836 for (int i=0; i <= lastAddrIndx; ++i) 10837 (*MIB).addOperand(*argOpers[i]); 10838 10839 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 10840 if (invSrc) { 10841 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 10842 } 10843 else 10844 tt = t1; 10845 10846 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 10847 assert((argOpers[valArgIndx]->isReg() || 10848 argOpers[valArgIndx]->isImm()) && 10849 "invalid operand"); 10850 if (argOpers[valArgIndx]->isReg()) 10851 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 10852 else 10853 MIB = 
BuildMI(newMBB, dl, TII->get(immOpc), t2); 10854 MIB.addReg(tt); 10855 (*MIB).addOperand(*argOpers[valArgIndx]); 10856 10857 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); 10858 MIB.addReg(t1); 10859 10860 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 10861 for (int i=0; i <= lastAddrIndx; ++i) 10862 (*MIB).addOperand(*argOpers[i]); 10863 MIB.addReg(t2); 10864 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 10865 (*MIB).setMemRefs(bInstr->memoperands_begin(), 10866 bInstr->memoperands_end()); 10867 10868 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 10869 MIB.addReg(EAXreg); 10870 10871 // insert branch 10872 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 10873 10874 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 10875 return nextMBB; 10876} 10877 10878// private utility function: 64 bit atomics on 32 bit host. 10879MachineBasicBlock * 10880X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 10881 MachineBasicBlock *MBB, 10882 unsigned regOpcL, 10883 unsigned regOpcH, 10884 unsigned immOpcL, 10885 unsigned immOpcH, 10886 bool invSrc) const { 10887 // For the atomic bitwise operator, we generate 10888 // thisMBB (instructions are in pairs, except cmpxchg8b) 10889 // ld t1,t2 = [bitinstr.addr] 10890 // newMBB: 10891 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 10892 // op t5, t6 <- out1, out2, [bitinstr.val] 10893 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 10894 // mov ECX, EBX <- t5, t6 10895 // mov EAX, EDX <- t1, t2 10896 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 10897 // mov t3, t4 <- EAX, EDX 10898 // bz newMBB 10899 // result in out1, out2 10900 // fallthrough -->nextMBB 10901 10902 const TargetRegisterClass *RC = X86::GR32RegisterClass; 10903 const unsigned LoadOpc = X86::MOV32rm; 10904 const unsigned NotOpc = X86::NOT32r; 10905 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10906 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 10907 MachineFunction::iterator MBBIter = MBB; 10908 ++MBBIter; 10909 10910 /// First build the CFG 10911 MachineFunction *F = MBB->getParent(); 10912 MachineBasicBlock *thisMBB = MBB; 10913 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 10914 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 10915 F->insert(MBBIter, newMBB); 10916 F->insert(MBBIter, nextMBB); 10917 10918 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 10919 nextMBB->splice(nextMBB->begin(), thisMBB, 10920 llvm::next(MachineBasicBlock::iterator(bInstr)), 10921 thisMBB->end()); 10922 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 10923 10924 // Update thisMBB to fall through to newMBB 10925 thisMBB->addSuccessor(newMBB); 10926 10927 // newMBB jumps to itself and fall through to nextMBB 10928 newMBB->addSuccessor(nextMBB); 10929 newMBB->addSuccessor(newMBB); 10930 10931 DebugLoc dl = bInstr->getDebugLoc(); 10932 // Insert instructions into newMBB based on incoming instruction 10933 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 
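  // Explicit operand layout as indexed below: operands 0 and 1 are the low and
  // high halves of the result, the next five are the address (base, scale,
  // index, displacement, segment), and the last two are the value halves.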
10934 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && 10935 "unexpected number of operands"); 10936 MachineOperand& dest1Oper = bInstr->getOperand(0); 10937 MachineOperand& dest2Oper = bInstr->getOperand(1); 10938 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 10939 for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { 10940 argOpers[i] = &bInstr->getOperand(i+2); 10941 10942 // We use some of the operands multiple times, so conservatively just 10943 // clear any kill flags that might be present. 10944 if (argOpers[i]->isReg() && argOpers[i]->isUse()) 10945 argOpers[i]->setIsKill(false); 10946 } 10947 10948 // x86 address has 5 operands: base, index, scale, displacement, and segment. 10949 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4] 10950 10951 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 10952 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 10953 for (int i=0; i <= lastAddrIndx; ++i) 10954 (*MIB).addOperand(*argOpers[i]); 10955 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 10956 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 10957 // add 4 to displacement. 10958 for (int i=0; i <= lastAddrIndx-2; ++i) 10959 (*MIB).addOperand(*argOpers[i]); 10960 MachineOperand newOp3 = *(argOpers[3]); 10961 if (newOp3.isImm()) 10962 newOp3.setImm(newOp3.getImm()+4); 10963 else 10964 newOp3.setOffset(newOp3.getOffset()+4); 10965 (*MIB).addOperand(newOp3); 10966 (*MIB).addOperand(*argOpers[lastAddrIndx]); 10967 10968 // t3/4 are defined later, at the bottom of the loop 10969 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 10970 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 10971 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 10972 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 10973 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 10974 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 10975 10976 // The subsequent operations should use the destination registers of 10977 // the PHI instructions.
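  // For the NAND pseudos invSrc is set, so the PHI results are complemented
  // into fresh registers here before the bitwise opcodes below consume them.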
10978 if (invSrc) { 10979 t1 = F->getRegInfo().createVirtualRegister(RC); 10980 t2 = F->getRegInfo().createVirtualRegister(RC); 10981 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 10982 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 10983 } else { 10984 t1 = dest1Oper.getReg(); 10985 t2 = dest2Oper.getReg(); 10986 } 10987 10988 int valArgIndx = lastAddrIndx + 1; 10989 assert((argOpers[valArgIndx]->isReg() || 10990 argOpers[valArgIndx]->isImm()) && 10991 "invalid operand"); 10992 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 10993 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 10994 if (argOpers[valArgIndx]->isReg()) 10995 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 10996 else 10997 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 10998 if (regOpcL != X86::MOV32rr) 10999 MIB.addReg(t1); 11000 (*MIB).addOperand(*argOpers[valArgIndx]); 11001 assert(argOpers[valArgIndx + 1]->isReg() == 11002 argOpers[valArgIndx]->isReg()); 11003 assert(argOpers[valArgIndx + 1]->isImm() == 11004 argOpers[valArgIndx]->isImm()); 11005 if (argOpers[valArgIndx + 1]->isReg()) 11006 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 11007 else 11008 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 11009 if (regOpcH != X86::MOV32rr) 11010 MIB.addReg(t2); 11011 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 11012 11013 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 11014 MIB.addReg(t1); 11015 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); 11016 MIB.addReg(t2); 11017 11018 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); 11019 MIB.addReg(t5); 11020 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); 11021 MIB.addReg(t6); 11022 11023 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 11024 for (int i=0; i <= lastAddrIndx; ++i) 11025 (*MIB).addOperand(*argOpers[i]); 11026 11027 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 11028 (*MIB).setMemRefs(bInstr->memoperands_begin(), 11029 bInstr->memoperands_end()); 11030 11031 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); 11032 MIB.addReg(X86::EAX); 11033 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); 11034 MIB.addReg(X86::EDX); 11035 11036 // insert branch 11037 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 11038 11039 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 
11040 return nextMBB; 11041} 11042 11043// private utility function 11044MachineBasicBlock * 11045X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 11046 MachineBasicBlock *MBB, 11047 unsigned cmovOpc) const { 11048 // For the atomic min/max operator, we generate 11049 // thisMBB: 11050 // newMBB: 11051 // ld t1 = [min/max.addr] 11052 // mov t2 = [min/max.val] 11053 // cmp t1, t2 11054 // cmov[cond] t2 = t1 11055 // mov EAX = t1 11056 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 11057 // bz newMBB 11058 // fallthrough -->nextMBB 11059 // 11060 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11061 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 11062 MachineFunction::iterator MBBIter = MBB; 11063 ++MBBIter; 11064 11065 /// First build the CFG 11066 MachineFunction *F = MBB->getParent(); 11067 MachineBasicBlock *thisMBB = MBB; 11068 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 11069 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 11070 F->insert(MBBIter, newMBB); 11071 F->insert(MBBIter, nextMBB); 11072 11073 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 11074 nextMBB->splice(nextMBB->begin(), thisMBB, 11075 llvm::next(MachineBasicBlock::iterator(mInstr)), 11076 thisMBB->end()); 11077 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 11078 11079 // Update thisMBB to fall through to newMBB 11080 thisMBB->addSuccessor(newMBB); 11081 11082 // newMBB jumps to newMBB and fall through to nextMBB 11083 newMBB->addSuccessor(nextMBB); 11084 newMBB->addSuccessor(newMBB); 11085 11086 DebugLoc dl = mInstr->getDebugLoc(); 11087 // Insert instructions into newMBB based on incoming instruction 11088 assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && 11089 "unexpected number of operands"); 11090 MachineOperand& destOper = mInstr->getOperand(0); 11091 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 11092 int numArgs = mInstr->getNumOperands() - 1; 11093 for (int i=0; i < numArgs; ++i) 11094 argOpers[i] = &mInstr->getOperand(i+1); 11095 11096 // x86 address has 4 operands: base, index, scale, and displacement 11097 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 11098 int valArgIndx = lastAddrIndx + 1; 11099 11100 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 11101 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 11102 for (int i=0; i <= lastAddrIndx; ++i) 11103 (*MIB).addOperand(*argOpers[i]); 11104 11105 // We only support register and immediate values 11106 assert((argOpers[valArgIndx]->isReg() || 11107 argOpers[valArgIndx]->isImm()) && 11108 "invalid operand"); 11109 11110 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 11111 if (argOpers[valArgIndx]->isReg()) 11112 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); 11113 else 11114 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 11115 (*MIB).addOperand(*argOpers[valArgIndx]); 11116 11117 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 11118 MIB.addReg(t1); 11119 11120 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 11121 MIB.addReg(t1); 11122 MIB.addReg(t2); 11123 11124 // Generate movc 11125 unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 11126 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 11127 MIB.addReg(t2); 11128 MIB.addReg(t1); 11129 11130 // Cmp and exchange if none has modified the memory location 11131 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 
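  // LCMPXCHG32 implicitly compares EAX against the memory operand and, on
  // success, stores t3; on failure EAX receives the current memory value and
  // the JNE below restarts the loop.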
11132 for (int i=0; i <= lastAddrIndx; ++i) 11133 (*MIB).addOperand(*argOpers[i]); 11134 MIB.addReg(t3); 11135 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 11136 (*MIB).setMemRefs(mInstr->memoperands_begin(), 11137 mInstr->memoperands_end()); 11138 11139 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 11140 MIB.addReg(X86::EAX); 11141 11142 // insert branch 11143 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 11144 11145 mInstr->eraseFromParent(); // The pseudo instruction is gone now. 11146 return nextMBB; 11147} 11148 11149// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 11150// or XMM0_V32I8 in AVX all of this code can be replaced with that 11151// in the .td file. 11152MachineBasicBlock * 11153X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 11154 unsigned numArgs, bool memArg) const { 11155 assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && 11156 "Target must have SSE4.2 or AVX features enabled"); 11157 11158 DebugLoc dl = MI->getDebugLoc(); 11159 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11160 unsigned Opc; 11161 if (!Subtarget->hasAVX()) { 11162 if (memArg) 11163 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 11164 else 11165 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 11166 } else { 11167 if (memArg) 11168 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 11169 else 11170 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 11171 } 11172 11173 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 11174 for (unsigned i = 0; i < numArgs; ++i) { 11175 MachineOperand &Op = MI->getOperand(i+1); 11176 if (!(Op.isReg() && Op.isImplicit())) 11177 MIB.addOperand(Op); 11178 } 11179 BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 11180 .addReg(X86::XMM0); 11181 11182 MI->eraseFromParent(); 11183 return BB; 11184} 11185 11186MachineBasicBlock * 11187X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const { 11188 DebugLoc dl = MI->getDebugLoc(); 11189 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11190 11191 // Address into RAX/EAX, other two args into ECX, EDX. 11192 unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; 11193 unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 11194 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); 11195 for (int i = 0; i < X86::AddrNumOperands; ++i) 11196 MIB.addOperand(MI->getOperand(i)); 11197 11198 unsigned ValOps = X86::AddrNumOperands; 11199 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 11200 .addReg(MI->getOperand(ValOps).getReg()); 11201 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) 11202 .addReg(MI->getOperand(ValOps+1).getReg()); 11203 11204 // The instruction doesn't actually take any operands though. 11205 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); 11206 11207 MI->eraseFromParent(); // The pseudo is gone now. 11208 return BB; 11209} 11210 11211MachineBasicBlock * 11212X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const { 11213 DebugLoc dl = MI->getDebugLoc(); 11214 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11215 11216 // First arg in ECX, the second in EAX. 
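  // MWAIT reads its operands implicitly from ECX and EAX, so the pseudo's
  // virtual register operands are copied into those physical registers first.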
11217 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 11218 .addReg(MI->getOperand(0).getReg()); 11219 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) 11220 .addReg(MI->getOperand(1).getReg()); 11221 11222 // The instruction doesn't actually take any operands though. 11223 BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr)); 11224 11225 MI->eraseFromParent(); // The pseudo is gone now. 11226 return BB; 11227} 11228 11229MachineBasicBlock * 11230X86TargetLowering::EmitVAARG64WithCustomInserter( 11231 MachineInstr *MI, 11232 MachineBasicBlock *MBB) const { 11233 // Emit va_arg instruction on X86-64. 11234 11235 // Operands to this pseudo-instruction: 11236 // 0 ) Output : destination address (reg) 11237 // 1-5) Input : va_list address (addr, i64mem) 11238 // 6 ) ArgSize : Size (in bytes) of vararg type 11239 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 11240 // 8 ) Align : Alignment of type 11241 // 9 ) EFLAGS (implicit-def) 11242 11243 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 11244 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); 11245 11246 unsigned DestReg = MI->getOperand(0).getReg(); 11247 MachineOperand &Base = MI->getOperand(1); 11248 MachineOperand &Scale = MI->getOperand(2); 11249 MachineOperand &Index = MI->getOperand(3); 11250 MachineOperand &Disp = MI->getOperand(4); 11251 MachineOperand &Segment = MI->getOperand(5); 11252 unsigned ArgSize = MI->getOperand(6).getImm(); 11253 unsigned ArgMode = MI->getOperand(7).getImm(); 11254 unsigned Align = MI->getOperand(8).getImm(); 11255 11256 // Memory Reference 11257 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 11258 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 11259 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 11260 11261 // Machine Information 11262 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11263 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 11264 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 11265 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 11266 DebugLoc DL = MI->getDebugLoc(); 11267 11268 // struct va_list { 11269 // i32 gp_offset 11270 // i32 fp_offset 11271 // i64 overflow_area (address) 11272 // i64 reg_save_area (address) 11273 // } 11274 // sizeof(va_list) = 24 11275 // alignment(va_list) = 8 11276 11277 unsigned TotalNumIntRegs = 6; 11278 unsigned TotalNumXMMRegs = 8; 11279 bool UseGPOffset = (ArgMode == 1); 11280 bool UseFPOffset = (ArgMode == 2); 11281 unsigned MaxOffset = TotalNumIntRegs * 8 + 11282 (UseFPOffset ? TotalNumXMMRegs * 16 : 0); 11283 11284 /* Align ArgSize to a multiple of 8 */ 11285 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 11286 bool NeedsAlign = (Align > 8); 11287 11288 MachineBasicBlock *thisMBB = MBB; 11289 MachineBasicBlock *overflowMBB; 11290 MachineBasicBlock *offsetMBB; 11291 MachineBasicBlock *endMBB; 11292 11293 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 11294 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 11295 unsigned OffsetReg = 0; 11296 11297 if (!UseGPOffset && !UseFPOffset) { 11298 // If we only pull from the overflow region, we don't create a branch. 11299 // We don't need to alter control flow. 
11300 OffsetDestReg = 0; // unused 11301 OverflowDestReg = DestReg; 11302 11303 offsetMBB = NULL; 11304 overflowMBB = thisMBB; 11305 endMBB = thisMBB; 11306 } else { 11307 // First emit code to check if gp_offset (or fp_offset) is below the bound. 11308 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 11309 // If not, pull from overflow_area. (branch to overflowMBB) 11310 // 11311 // thisMBB 11312 // | . 11313 // | . 11314 // offsetMBB overflowMBB 11315 // | . 11316 // | . 11317 // endMBB 11318 11319 // Registers for the PHI in endMBB 11320 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 11321 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 11322 11323 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 11324 MachineFunction *MF = MBB->getParent(); 11325 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 11326 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 11327 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 11328 11329 MachineFunction::iterator MBBIter = MBB; 11330 ++MBBIter; 11331 11332 // Insert the new basic blocks 11333 MF->insert(MBBIter, offsetMBB); 11334 MF->insert(MBBIter, overflowMBB); 11335 MF->insert(MBBIter, endMBB); 11336 11337 // Transfer the remainder of MBB and its successor edges to endMBB. 11338 endMBB->splice(endMBB->begin(), thisMBB, 11339 llvm::next(MachineBasicBlock::iterator(MI)), 11340 thisMBB->end()); 11341 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 11342 11343 // Make offsetMBB and overflowMBB successors of thisMBB 11344 thisMBB->addSuccessor(offsetMBB); 11345 thisMBB->addSuccessor(overflowMBB); 11346 11347 // endMBB is a successor of both offsetMBB and overflowMBB 11348 offsetMBB->addSuccessor(endMBB); 11349 overflowMBB->addSuccessor(endMBB); 11350 11351 // Load the offset value into a register 11352 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 11353 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 11354 .addOperand(Base) 11355 .addOperand(Scale) 11356 .addOperand(Index) 11357 .addDisp(Disp, UseFPOffset ? 4 : 0) 11358 .addOperand(Segment) 11359 .setMemRefs(MMOBegin, MMOEnd); 11360 11361 // Check if there is enough room left to pull this argument. 11362 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 11363 .addReg(OffsetReg) 11364 .addImm(MaxOffset + 8 - ArgSizeA8); 11365 11366 // Branch to "overflowMBB" if offset >= max 11367 // Fall through to "offsetMBB" otherwise 11368 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 11369 .addMBB(overflowMBB); 11370 } 11371 11372 // In offsetMBB, emit code to use the reg_save_area. 11373 if (offsetMBB) { 11374 assert(OffsetReg != 0); 11375 11376 // Read the reg_save_area address. 11377 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 11378 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 11379 .addOperand(Base) 11380 .addOperand(Scale) 11381 .addOperand(Index) 11382 .addDisp(Disp, 16) 11383 .addOperand(Segment) 11384 .setMemRefs(MMOBegin, MMOEnd); 11385 11386 // Zero-extend the offset 11387 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 11388 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 11389 .addImm(0) 11390 .addReg(OffsetReg) 11391 .addImm(X86::sub_32bit); 11392 11393 // Add the offset to the reg_save_area to get the final address. 
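    // OffsetDestReg = reg_save_area + zero-extended gp_offset/fp_offset, i.e.
    // the register-save-area slot the AMD64 ABI assigns to this argument.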
11394 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 11395 .addReg(OffsetReg64) 11396 .addReg(RegSaveReg); 11397 11398 // Compute the offset for the next argument 11399 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 11400 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 11401 .addReg(OffsetReg) 11402 .addImm(UseFPOffset ? 16 : 8); 11403 11404 // Store it back into the va_list. 11405 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 11406 .addOperand(Base) 11407 .addOperand(Scale) 11408 .addOperand(Index) 11409 .addDisp(Disp, UseFPOffset ? 4 : 0) 11410 .addOperand(Segment) 11411 .addReg(NextOffsetReg) 11412 .setMemRefs(MMOBegin, MMOEnd); 11413 11414 // Jump to endMBB 11415 BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) 11416 .addMBB(endMBB); 11417 } 11418 11419 // 11420 // Emit code to use overflow area 11421 // 11422 11423 // Load the overflow_area address into a register. 11424 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 11425 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 11426 .addOperand(Base) 11427 .addOperand(Scale) 11428 .addOperand(Index) 11429 .addDisp(Disp, 8) 11430 .addOperand(Segment) 11431 .setMemRefs(MMOBegin, MMOEnd); 11432 11433 // If we need to align it, do so. Otherwise, just copy the address 11434 // to OverflowDestReg. 11435 if (NeedsAlign) { 11436 // Align the overflow address 11437 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 11438 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 11439 11440 // aligned_addr = (addr + (align-1)) & ~(align-1) 11441 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 11442 .addReg(OverflowAddrReg) 11443 .addImm(Align-1); 11444 11445 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 11446 .addReg(TmpReg) 11447 .addImm(~(uint64_t)(Align-1)); 11448 } else { 11449 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 11450 .addReg(OverflowAddrReg); 11451 } 11452 11453 // Compute the next overflow address after this argument. 11454 // (the overflow address should be kept 8-byte aligned) 11455 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 11456 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 11457 .addReg(OverflowDestReg) 11458 .addImm(ArgSizeA8); 11459 11460 // Store the new overflow address. 11461 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 11462 .addOperand(Base) 11463 .addOperand(Scale) 11464 .addOperand(Index) 11465 .addDisp(Disp, 8) 11466 .addOperand(Segment) 11467 .addReg(NextAddrReg) 11468 .setMemRefs(MMOBegin, MMOEnd); 11469 11470 // If we branched, emit the PHI to the front of endMBB. 11471 if (offsetMBB) { 11472 BuildMI(*endMBB, endMBB->begin(), DL, 11473 TII->get(X86::PHI), DestReg) 11474 .addReg(OffsetDestReg).addMBB(offsetMBB) 11475 .addReg(OverflowDestReg).addMBB(overflowMBB); 11476 } 11477 11478 // Erase the pseudo instruction 11479 MI->eraseFromParent(); 11480 11481 return endMBB; 11482} 11483 11484MachineBasicBlock * 11485X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 11486 MachineInstr *MI, 11487 MachineBasicBlock *MBB) const { 11488 // Emit code to save XMM registers to the stack. The ABI says that the 11489 // number of registers to save is given in %al, so it's theoretically 11490 // possible to do an indirect jump trick to avoid saving all of them, 11491 // however this code takes a simpler approach and just executes all 11492 // of the stores if %al is non-zero. 
It's less code, and it's probably 11493 // easier on the hardware branch predictor, and stores aren't all that 11494 // expensive anyway. 11495 11496 // Create the new basic blocks. One block contains all the XMM stores, 11497 // and one block is the final destination regardless of whether any 11498 // stores were performed. 11499 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 11500 MachineFunction *F = MBB->getParent(); 11501 MachineFunction::iterator MBBIter = MBB; 11502 ++MBBIter; 11503 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 11504 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 11505 F->insert(MBBIter, XMMSaveMBB); 11506 F->insert(MBBIter, EndMBB); 11507 11508 // Transfer the remainder of MBB and its successor edges to EndMBB. 11509 EndMBB->splice(EndMBB->begin(), MBB, 11510 llvm::next(MachineBasicBlock::iterator(MI)), 11511 MBB->end()); 11512 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 11513 11514 // The original block will now fall through to the XMM save block. 11515 MBB->addSuccessor(XMMSaveMBB); 11516 // The XMMSaveMBB will fall through to the end block. 11517 XMMSaveMBB->addSuccessor(EndMBB); 11518 11519 // Now add the instructions. 11520 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11521 DebugLoc DL = MI->getDebugLoc(); 11522 11523 unsigned CountReg = MI->getOperand(0).getReg(); 11524 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 11525 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 11526 11527 if (!Subtarget->isTargetWin64()) { 11528 // If %al is 0, branch around the XMM save block. 11529 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 11530 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 11531 MBB->addSuccessor(EndMBB); 11532 } 11533 11534 // In the XMM save block, save all the XMM argument registers. 11535 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 11536 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 11537 MachineMemOperand *MMO = 11538 F->getMachineMemOperand( 11539 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 11540 MachineMemOperand::MOStore, 11541 /*Size=*/16, /*Align=*/16); 11542 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 11543 .addFrameIndex(RegSaveFrameIndex) 11544 .addImm(/*Scale=*/1) 11545 .addReg(/*IndexReg=*/0) 11546 .addImm(/*Disp=*/Offset) 11547 .addReg(/*Segment=*/0) 11548 .addReg(MI->getOperand(i).getReg()) 11549 .addMemOperand(MMO); 11550 } 11551 11552 MI->eraseFromParent(); // The pseudo instruction is gone now. 11553 11554 return EndMBB; 11555} 11556 11557MachineBasicBlock * 11558X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 11559 MachineBasicBlock *BB) const { 11560 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11561 DebugLoc DL = MI->getDebugLoc(); 11562 11563 // To "insert" a SELECT_CC instruction, we actually have to insert the 11564 // diamond control-flow pattern. The incoming instruction knows the 11565 // destination vreg to set, the condition code register to branch on, the 11566 // true/false values to select between, and a branch opcode to use. 11567 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 11568 MachineFunction::iterator It = BB; 11569 ++It; 11570 11571 // thisMBB: 11572 // ... 11573 // TrueVal = ... 
11574 // cmpTY ccX, r1, r2 11575 // bCC copy1MBB 11576 // fallthrough --> copy0MBB 11577 MachineBasicBlock *thisMBB = BB; 11578 MachineFunction *F = BB->getParent(); 11579 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 11580 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 11581 F->insert(It, copy0MBB); 11582 F->insert(It, sinkMBB); 11583 11584 // If the EFLAGS register isn't dead in the terminator, then claim that it's 11585 // live into the sink and copy blocks. 11586 const MachineFunction *MF = BB->getParent(); 11587 const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); 11588 BitVector ReservedRegs = TRI->getReservedRegs(*MF); 11589 11590 for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { 11591 const MachineOperand &MO = MI->getOperand(I); 11592 if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; 11593 unsigned Reg = MO.getReg(); 11594 if (Reg != X86::EFLAGS) continue; 11595 copy0MBB->addLiveIn(Reg); 11596 sinkMBB->addLiveIn(Reg); 11597 } 11598 11599 // Transfer the remainder of BB and its successor edges to sinkMBB. 11600 sinkMBB->splice(sinkMBB->begin(), BB, 11601 llvm::next(MachineBasicBlock::iterator(MI)), 11602 BB->end()); 11603 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 11604 11605 // Add the true and fallthrough blocks as its successors. 11606 BB->addSuccessor(copy0MBB); 11607 BB->addSuccessor(sinkMBB); 11608 11609 // Create the conditional branch instruction. 11610 unsigned Opc = 11611 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 11612 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 11613 11614 // copy0MBB: 11615 // %FalseValue = ... 11616 // # fallthrough to sinkMBB 11617 copy0MBB->addSuccessor(sinkMBB); 11618 11619 // sinkMBB: 11620 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 11621 // ... 11622 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 11623 TII->get(X86::PHI), MI->getOperand(0).getReg()) 11624 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 11625 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 11626 11627 MI->eraseFromParent(); // The pseudo instruction is gone now. 11628 return sinkMBB; 11629} 11630 11631MachineBasicBlock * 11632X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 11633 MachineBasicBlock *BB) const { 11634 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11635 DebugLoc DL = MI->getDebugLoc(); 11636 11637 assert(!Subtarget->isTargetEnvMacho()); 11638 11639 // The lowering is pretty easy: we're just emitting the call to _alloca. The 11640 // non-trivial part is impdef of ESP. 11641 11642 if (Subtarget->isTargetWin64()) { 11643 if (Subtarget->isTargetCygMing()) { 11644 // ___chkstk(Mingw64): 11645 // Clobbers R10, R11, RAX and EFLAGS. 11646 // Updates RSP. 11647 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 11648 .addExternalSymbol("___chkstk") 11649 .addReg(X86::RAX, RegState::Implicit) 11650 .addReg(X86::RSP, RegState::Implicit) 11651 .addReg(X86::RAX, RegState::Define | RegState::Implicit) 11652 .addReg(X86::RSP, RegState::Define | RegState::Implicit) 11653 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 11654 } else { 11655 // __chkstk(MSVCRT): does not update stack pointer. 11656 // Clobbers R10, R11 and EFLAGS. 11657 // FIXME: RAX(allocated size) might be reused and not killed. 
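      // MSVC's __chkstk only probes the stack pages; unlike the MinGW variant
      // above it leaves RSP alone, so the explicit SUB64rr below performs the
      // actual allocation.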
11658 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 11659 .addExternalSymbol("__chkstk") 11660 .addReg(X86::RAX, RegState::Implicit) 11661 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 11662 // RAX has the offset to subtracted from RSP. 11663 BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) 11664 .addReg(X86::RSP) 11665 .addReg(X86::RAX); 11666 } 11667 } else { 11668 const char *StackProbeSymbol = 11669 Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; 11670 11671 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 11672 .addExternalSymbol(StackProbeSymbol) 11673 .addReg(X86::EAX, RegState::Implicit) 11674 .addReg(X86::ESP, RegState::Implicit) 11675 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 11676 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 11677 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 11678 } 11679 11680 MI->eraseFromParent(); // The pseudo instruction is gone now. 11681 return BB; 11682} 11683 11684MachineBasicBlock * 11685X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 11686 MachineBasicBlock *BB) const { 11687 // This is pretty easy. We're taking the value that we received from 11688 // our load from the relocation, sticking it in either RDI (x86-64) 11689 // or EAX and doing an indirect call. The return value will then 11690 // be in the normal return register. 11691 const X86InstrInfo *TII 11692 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 11693 DebugLoc DL = MI->getDebugLoc(); 11694 MachineFunction *F = BB->getParent(); 11695 11696 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 11697 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 11698 11699 if (Subtarget->is64Bit()) { 11700 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 11701 TII->get(X86::MOV64rm), X86::RDI) 11702 .addReg(X86::RIP) 11703 .addImm(0).addReg(0) 11704 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 11705 MI->getOperand(3).getTargetFlags()) 11706 .addReg(0); 11707 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 11708 addDirectMem(MIB, X86::RDI); 11709 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 11710 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 11711 TII->get(X86::MOV32rm), X86::EAX) 11712 .addReg(0) 11713 .addImm(0).addReg(0) 11714 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 11715 MI->getOperand(3).getTargetFlags()) 11716 .addReg(0); 11717 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 11718 addDirectMem(MIB, X86::EAX); 11719 } else { 11720 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 11721 TII->get(X86::MOV32rm), X86::EAX) 11722 .addReg(TII->getGlobalBaseReg(F)) 11723 .addImm(0).addReg(0) 11724 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 11725 MI->getOperand(3).getTargetFlags()) 11726 .addReg(0); 11727 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 11728 addDirectMem(MIB, X86::EAX); 11729 } 11730 11731 MI->eraseFromParent(); // The pseudo instruction is gone now. 
11732 return BB; 11733} 11734 11735MachineBasicBlock * 11736X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 11737 MachineBasicBlock *BB) const { 11738 switch (MI->getOpcode()) { 11739 default: assert(false && "Unexpected instr type to insert"); 11740 case X86::TAILJMPd64: 11741 case X86::TAILJMPr64: 11742 case X86::TAILJMPm64: 11743 assert(!"TAILJMP64 would not be touched here."); 11744 case X86::TCRETURNdi64: 11745 case X86::TCRETURNri64: 11746 case X86::TCRETURNmi64: 11747 // Defs of TCRETURNxx64 has Win64's callee-saved registers, as subset. 11748 // On AMD64, additional defs should be added before register allocation. 11749 if (!Subtarget->isTargetWin64()) { 11750 MI->addRegisterDefined(X86::RSI); 11751 MI->addRegisterDefined(X86::RDI); 11752 MI->addRegisterDefined(X86::XMM6); 11753 MI->addRegisterDefined(X86::XMM7); 11754 MI->addRegisterDefined(X86::XMM8); 11755 MI->addRegisterDefined(X86::XMM9); 11756 MI->addRegisterDefined(X86::XMM10); 11757 MI->addRegisterDefined(X86::XMM11); 11758 MI->addRegisterDefined(X86::XMM12); 11759 MI->addRegisterDefined(X86::XMM13); 11760 MI->addRegisterDefined(X86::XMM14); 11761 MI->addRegisterDefined(X86::XMM15); 11762 } 11763 return BB; 11764 case X86::WIN_ALLOCA: 11765 return EmitLoweredWinAlloca(MI, BB); 11766 case X86::TLSCall_32: 11767 case X86::TLSCall_64: 11768 return EmitLoweredTLSCall(MI, BB); 11769 case X86::CMOV_GR8: 11770 case X86::CMOV_FR32: 11771 case X86::CMOV_FR64: 11772 case X86::CMOV_V4F32: 11773 case X86::CMOV_V2F64: 11774 case X86::CMOV_V2I64: 11775 case X86::CMOV_V8F32: 11776 case X86::CMOV_V4F64: 11777 case X86::CMOV_V4I64: 11778 case X86::CMOV_GR16: 11779 case X86::CMOV_GR32: 11780 case X86::CMOV_RFP32: 11781 case X86::CMOV_RFP64: 11782 case X86::CMOV_RFP80: 11783 return EmitLoweredSelect(MI, BB); 11784 11785 case X86::FP32_TO_INT16_IN_MEM: 11786 case X86::FP32_TO_INT32_IN_MEM: 11787 case X86::FP32_TO_INT64_IN_MEM: 11788 case X86::FP64_TO_INT16_IN_MEM: 11789 case X86::FP64_TO_INT32_IN_MEM: 11790 case X86::FP64_TO_INT64_IN_MEM: 11791 case X86::FP80_TO_INT16_IN_MEM: 11792 case X86::FP80_TO_INT32_IN_MEM: 11793 case X86::FP80_TO_INT64_IN_MEM: { 11794 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11795 DebugLoc DL = MI->getDebugLoc(); 11796 11797 // Change the floating point control register to use "round towards zero" 11798 // mode when truncating to an integer value. 11799 MachineFunction *F = BB->getParent(); 11800 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 11801 addFrameReference(BuildMI(*BB, MI, DL, 11802 TII->get(X86::FNSTCW16m)), CWFrameIdx); 11803 11804 // Load the old value of the high byte of the control word... 11805 unsigned OldCW = 11806 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 11807 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 11808 CWFrameIdx); 11809 11810 // Set the high part to be round to zero... 11811 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 11812 .addImm(0xC7F); 11813 11814 // Reload the modified control word now... 11815 addFrameReference(BuildMI(*BB, MI, DL, 11816 TII->get(X86::FLDCW16m)), CWFrameIdx); 11817 11818 // Restore the memory image of control word to original value 11819 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 11820 .addReg(OldCW); 11821 11822 // Get the X86 opcode to use. 
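    // The IST_Fp<N>m<M> opcodes chosen below store an f<M> x87 value to memory
    // as an N-bit integer; with the control word forced to round-toward-zero
    // above, that store performs the required truncating conversion.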
11823 unsigned Opc; 11824 switch (MI->getOpcode()) { 11825 default: llvm_unreachable("illegal opcode!"); 11826 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 11827 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 11828 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 11829 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 11830 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 11831 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 11832 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 11833 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 11834 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 11835 } 11836 11837 X86AddressMode AM; 11838 MachineOperand &Op = MI->getOperand(0); 11839 if (Op.isReg()) { 11840 AM.BaseType = X86AddressMode::RegBase; 11841 AM.Base.Reg = Op.getReg(); 11842 } else { 11843 AM.BaseType = X86AddressMode::FrameIndexBase; 11844 AM.Base.FrameIndex = Op.getIndex(); 11845 } 11846 Op = MI->getOperand(1); 11847 if (Op.isImm()) 11848 AM.Scale = Op.getImm(); 11849 Op = MI->getOperand(2); 11850 if (Op.isImm()) 11851 AM.IndexReg = Op.getImm(); 11852 Op = MI->getOperand(3); 11853 if (Op.isGlobal()) { 11854 AM.GV = Op.getGlobal(); 11855 } else { 11856 AM.Disp = Op.getImm(); 11857 } 11858 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 11859 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 11860 11861 // Reload the original control word now. 11862 addFrameReference(BuildMI(*BB, MI, DL, 11863 TII->get(X86::FLDCW16m)), CWFrameIdx); 11864 11865 MI->eraseFromParent(); // The pseudo instruction is gone now. 11866 return BB; 11867 } 11868 // String/text processing lowering. 11869 case X86::PCMPISTRM128REG: 11870 case X86::VPCMPISTRM128REG: 11871 return EmitPCMP(MI, BB, 3, false /* in-mem */); 11872 case X86::PCMPISTRM128MEM: 11873 case X86::VPCMPISTRM128MEM: 11874 return EmitPCMP(MI, BB, 3, true /* in-mem */); 11875 case X86::PCMPESTRM128REG: 11876 case X86::VPCMPESTRM128REG: 11877 return EmitPCMP(MI, BB, 5, false /* in mem */); 11878 case X86::PCMPESTRM128MEM: 11879 case X86::VPCMPESTRM128MEM: 11880 return EmitPCMP(MI, BB, 5, true /* in mem */); 11881 11882 // Thread synchronization. 11883 case X86::MONITOR: 11884 return EmitMonitor(MI, BB); 11885 case X86::MWAIT: 11886 return EmitMwait(MI, BB); 11887 11888 // Atomic Lowering. 
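  // Each ATOM* pseudo below is expanded into the load / operate / LCMPXCHG
  // retry loop built by the custom inserters above, parameterized by the
  // width-specific opcodes and accumulator register.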
11889 case X86::ATOMAND32: 11890 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 11891 X86::AND32ri, X86::MOV32rm, 11892 X86::LCMPXCHG32, 11893 X86::NOT32r, X86::EAX, 11894 X86::GR32RegisterClass); 11895 case X86::ATOMOR32: 11896 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 11897 X86::OR32ri, X86::MOV32rm, 11898 X86::LCMPXCHG32, 11899 X86::NOT32r, X86::EAX, 11900 X86::GR32RegisterClass); 11901 case X86::ATOMXOR32: 11902 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 11903 X86::XOR32ri, X86::MOV32rm, 11904 X86::LCMPXCHG32, 11905 X86::NOT32r, X86::EAX, 11906 X86::GR32RegisterClass); 11907 case X86::ATOMNAND32: 11908 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 11909 X86::AND32ri, X86::MOV32rm, 11910 X86::LCMPXCHG32, 11911 X86::NOT32r, X86::EAX, 11912 X86::GR32RegisterClass, true); 11913 case X86::ATOMMIN32: 11914 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 11915 case X86::ATOMMAX32: 11916 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 11917 case X86::ATOMUMIN32: 11918 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 11919 case X86::ATOMUMAX32: 11920 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 11921 11922 case X86::ATOMAND16: 11923 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 11924 X86::AND16ri, X86::MOV16rm, 11925 X86::LCMPXCHG16, 11926 X86::NOT16r, X86::AX, 11927 X86::GR16RegisterClass); 11928 case X86::ATOMOR16: 11929 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 11930 X86::OR16ri, X86::MOV16rm, 11931 X86::LCMPXCHG16, 11932 X86::NOT16r, X86::AX, 11933 X86::GR16RegisterClass); 11934 case X86::ATOMXOR16: 11935 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 11936 X86::XOR16ri, X86::MOV16rm, 11937 X86::LCMPXCHG16, 11938 X86::NOT16r, X86::AX, 11939 X86::GR16RegisterClass); 11940 case X86::ATOMNAND16: 11941 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 11942 X86::AND16ri, X86::MOV16rm, 11943 X86::LCMPXCHG16, 11944 X86::NOT16r, X86::AX, 11945 X86::GR16RegisterClass, true); 11946 case X86::ATOMMIN16: 11947 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 11948 case X86::ATOMMAX16: 11949 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 11950 case X86::ATOMUMIN16: 11951 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 11952 case X86::ATOMUMAX16: 11953 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 11954 11955 case X86::ATOMAND8: 11956 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 11957 X86::AND8ri, X86::MOV8rm, 11958 X86::LCMPXCHG8, 11959 X86::NOT8r, X86::AL, 11960 X86::GR8RegisterClass); 11961 case X86::ATOMOR8: 11962 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 11963 X86::OR8ri, X86::MOV8rm, 11964 X86::LCMPXCHG8, 11965 X86::NOT8r, X86::AL, 11966 X86::GR8RegisterClass); 11967 case X86::ATOMXOR8: 11968 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 11969 X86::XOR8ri, X86::MOV8rm, 11970 X86::LCMPXCHG8, 11971 X86::NOT8r, X86::AL, 11972 X86::GR8RegisterClass); 11973 case X86::ATOMNAND8: 11974 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 11975 X86::AND8ri, X86::MOV8rm, 11976 X86::LCMPXCHG8, 11977 X86::NOT8r, X86::AL, 11978 X86::GR8RegisterClass, true); 11979 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 11980 // This group is for 64-bit host. 
11981 case X86::ATOMAND64: 11982 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 11983 X86::AND64ri32, X86::MOV64rm, 11984 X86::LCMPXCHG64, 11985 X86::NOT64r, X86::RAX, 11986 X86::GR64RegisterClass); 11987 case X86::ATOMOR64: 11988 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 11989 X86::OR64ri32, X86::MOV64rm, 11990 X86::LCMPXCHG64, 11991 X86::NOT64r, X86::RAX, 11992 X86::GR64RegisterClass); 11993 case X86::ATOMXOR64: 11994 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 11995 X86::XOR64ri32, X86::MOV64rm, 11996 X86::LCMPXCHG64, 11997 X86::NOT64r, X86::RAX, 11998 X86::GR64RegisterClass); 11999 case X86::ATOMNAND64: 12000 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 12001 X86::AND64ri32, X86::MOV64rm, 12002 X86::LCMPXCHG64, 12003 X86::NOT64r, X86::RAX, 12004 X86::GR64RegisterClass, true); 12005 case X86::ATOMMIN64: 12006 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 12007 case X86::ATOMMAX64: 12008 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 12009 case X86::ATOMUMIN64: 12010 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 12011 case X86::ATOMUMAX64: 12012 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 12013 12014 // This group does 64-bit operations on a 32-bit host. 12015 case X86::ATOMAND6432: 12016 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12017 X86::AND32rr, X86::AND32rr, 12018 X86::AND32ri, X86::AND32ri, 12019 false); 12020 case X86::ATOMOR6432: 12021 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12022 X86::OR32rr, X86::OR32rr, 12023 X86::OR32ri, X86::OR32ri, 12024 false); 12025 case X86::ATOMXOR6432: 12026 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12027 X86::XOR32rr, X86::XOR32rr, 12028 X86::XOR32ri, X86::XOR32ri, 12029 false); 12030 case X86::ATOMNAND6432: 12031 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12032 X86::AND32rr, X86::AND32rr, 12033 X86::AND32ri, X86::AND32ri, 12034 true); 12035 case X86::ATOMADD6432: 12036 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12037 X86::ADD32rr, X86::ADC32rr, 12038 X86::ADD32ri, X86::ADC32ri, 12039 false); 12040 case X86::ATOMSUB6432: 12041 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12042 X86::SUB32rr, X86::SBB32rr, 12043 X86::SUB32ri, X86::SBB32ri, 12044 false); 12045 case X86::ATOMSWAP6432: 12046 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12047 X86::MOV32rr, X86::MOV32rr, 12048 X86::MOV32ri, X86::MOV32ri, 12049 false); 12050 case X86::VASTART_SAVE_XMM_REGS: 12051 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 12052 12053 case X86::VAARG_64: 12054 return EmitVAARG64WithCustomInserter(MI, BB); 12055 } 12056} 12057 12058//===----------------------------------------------------------------------===// 12059// X86 Optimization Hooks 12060//===----------------------------------------------------------------------===// 12061 12062void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 12063 const APInt &Mask, 12064 APInt &KnownZero, 12065 APInt &KnownOne, 12066 const SelectionDAG &DAG, 12067 unsigned Depth) const { 12068 unsigned Opc = Op.getOpcode(); 12069 assert((Opc >= ISD::BUILTIN_OP_END || 12070 Opc == ISD::INTRINSIC_WO_CHAIN || 12071 Opc == ISD::INTRINSIC_W_CHAIN || 12072 Opc == ISD::INTRINSIC_VOID) && 12073 "Should use MaskedValueIsZero if you don't know whether Op" 12074 " is a target node!"); 12075 12076 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
12077 switch (Opc) { 12078 default: break; 12079 case X86ISD::ADD: 12080 case X86ISD::SUB: 12081 case X86ISD::ADC: 12082 case X86ISD::SBB: 12083 case X86ISD::SMUL: 12084 case X86ISD::UMUL: 12085 case X86ISD::INC: 12086 case X86ISD::DEC: 12087 case X86ISD::OR: 12088 case X86ISD::XOR: 12089 case X86ISD::AND: 12090 // These nodes' second result is a boolean. 12091 if (Op.getResNo() == 0) 12092 break; 12093 // Fallthrough 12094 case X86ISD::SETCC: 12095 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 12096 Mask.getBitWidth() - 1); 12097 break; 12098 } 12099} 12100 12101unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 12102 unsigned Depth) const { 12103 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 12104 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 12105 return Op.getValueType().getScalarType().getSizeInBits(); 12106 12107 // Fallback case. 12108 return 1; 12109} 12110 12111/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 12112/// node is a GlobalAddress + offset. 12113bool X86TargetLowering::isGAPlusOffset(SDNode *N, 12114 const GlobalValue* &GA, 12115 int64_t &Offset) const { 12116 if (N->getOpcode() == X86ISD::Wrapper) { 12117 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 12118 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 12119 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 12120 return true; 12121 } 12122 } 12123 return TargetLowering::isGAPlusOffset(N, GA, Offset); 12124} 12125 12126/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the 12127/// same as extracting the high 128-bit part of 256-bit vector and then 12128/// inserting the result into the low part of a new 256-bit vector 12129static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { 12130 EVT VT = SVOp->getValueType(0); 12131 int NumElems = VT.getVectorNumElements(); 12132 12133 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 12134 for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j) 12135 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 12136 SVOp->getMaskElt(j) >= 0) 12137 return false; 12138 12139 return true; 12140} 12141 12142/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the 12143/// same as extracting the low 128-bit part of 256-bit vector and then 12144/// inserting the result into the high part of a new 256-bit vector 12145static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { 12146 EVT VT = SVOp->getValueType(0); 12147 int NumElems = VT.getVectorNumElements(); 12148 12149 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 12150 for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j) 12151 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 12152 SVOp->getMaskElt(j) >= 0) 12153 return false; 12154 12155 return true; 12156} 12157 12158/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. 12159static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, 12160 TargetLowering::DAGCombinerInfo &DCI) { 12161 DebugLoc dl = N->getDebugLoc(); 12162 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 12163 SDValue V1 = SVOp->getOperand(0); 12164 SDValue V2 = SVOp->getOperand(1); 12165 EVT VT = SVOp->getValueType(0); 12166 int NumElems = VT.getVectorNumElements(); 12167 12168 if (V1.getOpcode() == ISD::CONCAT_VECTORS && 12169 V2.getOpcode() == ISD::CONCAT_VECTORS) { 12170 // 12171 // 0,0,0,... 
12172 // | 12173 // V UNDEF BUILD_VECTOR UNDEF 12174 // \ / \ / 12175 // CONCAT_VECTOR CONCAT_VECTOR 12176 // \ / 12177 // \ / 12178 // RESULT: V + zero extended 12179 // 12180 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || 12181 V2.getOperand(1).getOpcode() != ISD::UNDEF || 12182 V1.getOperand(1).getOpcode() != ISD::UNDEF) 12183 return SDValue(); 12184 12185 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) 12186 return SDValue(); 12187 12188 // To match the shuffle mask, the first half of the mask should 12189 // be exactly the first vector, and all the rest a splat with the 12190 // first element of the second one. 12191 for (int i = 0; i < NumElems/2; ++i) 12192 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || 12193 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) 12194 return SDValue(); 12195 12196 // Emit a zeroed vector and insert the desired subvector on its 12197 // first half. 12198 SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */, DAG, dl); 12199 SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 12200 DAG.getConstant(0, MVT::i32), DAG, dl); 12201 return DCI.CombineTo(N, InsV); 12202 } 12203 12204 //===--------------------------------------------------------------------===// 12205 // Combine some shuffles into subvector extracts and inserts: 12206 // 12207 12208 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 12209 if (isShuffleHigh128VectorInsertLow(SVOp)) { 12210 SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32), 12211 DAG, dl); 12212 SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), 12213 V, DAG.getConstant(0, MVT::i32), DAG, dl); 12214 return DCI.CombineTo(N, InsV); 12215 } 12216 12217 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 12218 if (isShuffleLow128VectorInsertHigh(SVOp)) { 12219 SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl); 12220 SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), 12221 V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); 12222 return DCI.CombineTo(N, InsV); 12223 } 12224 12225 return SDValue(); 12226} 12227 12228/// PerformShuffleCombine - Performs several different shuffle combines. 12229static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 12230 TargetLowering::DAGCombinerInfo &DCI, 12231 const X86Subtarget *Subtarget) { 12232 DebugLoc dl = N->getDebugLoc(); 12233 EVT VT = N->getValueType(0); 12234 12235 // Don't create instructions with illegal types after legalize types has run. 12236 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12237 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 12238 return SDValue(); 12239 12240 // Combine 256-bit vector shuffles. This is only profitable when in AVX mode 12241 if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 && 12242 N->getOpcode() == ISD::VECTOR_SHUFFLE) 12243 return PerformShuffleCombine256(N, DAG, DCI); 12244 12245 // Only handle 128 wide vector from here on. 12246 if (VT.getSizeInBits() != 128) 12247 return SDValue(); 12248 12249 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, 12250 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are 12251 // consecutive, non-overlapping, and in the right order. 
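  // Illustrative case: a <0,1,2,3> shuffle of a build_vector of four i32 loads
  // from p, p+4, p+8 and p+12 can be folded into a single 16-byte load from p;
  // EltsFromConsecutiveLoads performs the actual legality checks.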
12252 SmallVector<SDValue, 16> Elts;
12253 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
12254 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
12255
12256 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
12257}
12258
12259/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
12260/// generation and convert it from being a bunch of shuffles and extracts
12261/// to a simple store and scalar loads to extract the elements.
12262static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
12263 const TargetLowering &TLI) {
12264 SDValue InputVector = N->getOperand(0);
12265
12266 // Only operate on vectors of 4 elements, where the alternative shuffling
12267 // gets to be more expensive.
12268 if (InputVector.getValueType() != MVT::v4i32)
12269 return SDValue();
12270
12271 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
12272 // single use which is a sign-extend or zero-extend, and all elements are
12273 // used.
12274 SmallVector<SDNode *, 4> Uses;
12275 unsigned ExtractedElements = 0;
12276 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
12277 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
12278 if (UI.getUse().getResNo() != InputVector.getResNo())
12279 return SDValue();
12280
12281 SDNode *Extract = *UI;
12282 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12283 return SDValue();
12284
12285 if (Extract->getValueType(0) != MVT::i32)
12286 return SDValue();
12287 if (!Extract->hasOneUse())
12288 return SDValue();
12289 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
12290 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
12291 return SDValue();
12292 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
12293 return SDValue();
12294
12295 // Record which element was extracted.
12296 ExtractedElements |=
12297 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
12298
12299 Uses.push_back(Extract);
12300 }
12301
12302 // If not all the elements were used, this may not be worthwhile.
12303 if (ExtractedElements != 15)
12304 return SDValue();
12305
12306 // Ok, we've now decided to do the transformation.
12307 DebugLoc dl = InputVector.getDebugLoc();
12308
12309 // Store the value to a temporary stack slot.
12310 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
12311 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
12312 MachinePointerInfo(), false, false, 0);
12313
12314 // Replace each use (extract) with a load of the appropriate element.
12315 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
12316 UE = Uses.end(); UI != UE; ++UI) {
12317 SDNode *Extract = *UI;
12318
12319 // Compute the element's address.
12320 SDValue Idx = Extract->getOperand(1);
12321 unsigned EltSize =
12322 InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
12323 uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
12324 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
12325
12326 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
12327 StackPtr, OffsetVal);
12328
12329 // Load the scalar.
12330 SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
12331 ScalarAddr, MachinePointerInfo(),
12332 false, false, 0);
12333
12334 // Replace the extract with the load.
12335 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
12336 }
12337
12338 // The replacement was made in place; don't return anything.
12339 return SDValue(); 12340} 12341 12342/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 12343static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 12344 const X86Subtarget *Subtarget) { 12345 DebugLoc DL = N->getDebugLoc(); 12346 SDValue Cond = N->getOperand(0); 12347 // Get the LHS/RHS of the select. 12348 SDValue LHS = N->getOperand(1); 12349 SDValue RHS = N->getOperand(2); 12350 12351 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 12352 // instructions match the semantics of the common C idiom x<y?x:y but not 12353 // x<=y?x:y, because of how they handle negative zero (which can be 12354 // ignored in unsafe-math mode). 12355 if (Subtarget->hasSSE2() && 12356 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 12357 Cond.getOpcode() == ISD::SETCC) { 12358 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 12359 12360 unsigned Opcode = 0; 12361 // Check for x CC y ? x : y. 12362 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 12363 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 12364 switch (CC) { 12365 default: break; 12366 case ISD::SETULT: 12367 // Converting this to a min would handle NaNs incorrectly, and swapping 12368 // the operands would cause it to handle comparisons between positive 12369 // and negative zero incorrectly. 12370 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 12371 if (!UnsafeFPMath && 12372 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 12373 break; 12374 std::swap(LHS, RHS); 12375 } 12376 Opcode = X86ISD::FMIN; 12377 break; 12378 case ISD::SETOLE: 12379 // Converting this to a min would handle comparisons between positive 12380 // and negative zero incorrectly. 12381 if (!UnsafeFPMath && 12382 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 12383 break; 12384 Opcode = X86ISD::FMIN; 12385 break; 12386 case ISD::SETULE: 12387 // Converting this to a min would handle both negative zeros and NaNs 12388 // incorrectly, but we can swap the operands to fix both. 12389 std::swap(LHS, RHS); 12390 case ISD::SETOLT: 12391 case ISD::SETLT: 12392 case ISD::SETLE: 12393 Opcode = X86ISD::FMIN; 12394 break; 12395 12396 case ISD::SETOGE: 12397 // Converting this to a max would handle comparisons between positive 12398 // and negative zero incorrectly. 12399 if (!UnsafeFPMath && 12400 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 12401 break; 12402 Opcode = X86ISD::FMAX; 12403 break; 12404 case ISD::SETUGT: 12405 // Converting this to a max would handle NaNs incorrectly, and swapping 12406 // the operands would cause it to handle comparisons between positive 12407 // and negative zero incorrectly. 12408 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 12409 if (!UnsafeFPMath && 12410 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 12411 break; 12412 std::swap(LHS, RHS); 12413 } 12414 Opcode = X86ISD::FMAX; 12415 break; 12416 case ISD::SETUGE: 12417 // Converting this to a max would handle both negative zeros and NaNs 12418 // incorrectly, but we can swap the operands to fix both. 12419 std::swap(LHS, RHS); 12420 case ISD::SETOGT: 12421 case ISD::SETGT: 12422 case ISD::SETGE: 12423 Opcode = X86ISD::FMAX; 12424 break; 12425 } 12426 // Check for x CC y ? y : x -- a min/max with reversed arms. 
12427 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 12428 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 12429 switch (CC) { 12430 default: break; 12431 case ISD::SETOGE: 12432 // Converting this to a min would handle comparisons between positive 12433 // and negative zero incorrectly, and swapping the operands would 12434 // cause it to handle NaNs incorrectly. 12435 if (!UnsafeFPMath && 12436 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 12437 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 12438 break; 12439 std::swap(LHS, RHS); 12440 } 12441 Opcode = X86ISD::FMIN; 12442 break; 12443 case ISD::SETUGT: 12444 // Converting this to a min would handle NaNs incorrectly. 12445 if (!UnsafeFPMath && 12446 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 12447 break; 12448 Opcode = X86ISD::FMIN; 12449 break; 12450 case ISD::SETUGE: 12451 // Converting this to a min would handle both negative zeros and NaNs 12452 // incorrectly, but we can swap the operands to fix both. 12453 std::swap(LHS, RHS); 12454 case ISD::SETOGT: 12455 case ISD::SETGT: 12456 case ISD::SETGE: 12457 Opcode = X86ISD::FMIN; 12458 break; 12459 12460 case ISD::SETULT: 12461 // Converting this to a max would handle NaNs incorrectly. 12462 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 12463 break; 12464 Opcode = X86ISD::FMAX; 12465 break; 12466 case ISD::SETOLE: 12467 // Converting this to a max would handle comparisons between positive 12468 // and negative zero incorrectly, and swapping the operands would 12469 // cause it to handle NaNs incorrectly. 12470 if (!UnsafeFPMath && 12471 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 12472 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 12473 break; 12474 std::swap(LHS, RHS); 12475 } 12476 Opcode = X86ISD::FMAX; 12477 break; 12478 case ISD::SETULE: 12479 // Converting this to a max would handle both negative zeros and NaNs 12480 // incorrectly, but we can swap the operands to fix both. 12481 std::swap(LHS, RHS); 12482 case ISD::SETOLT: 12483 case ISD::SETLT: 12484 case ISD::SETLE: 12485 Opcode = X86ISD::FMAX; 12486 break; 12487 } 12488 } 12489 12490 if (Opcode) 12491 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 12492 } 12493 12494 // If this is a select between two integer constants, try to do some 12495 // optimizations. 12496 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 12497 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 12498 // Don't do this for crazy integer types. 12499 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 12500 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 12501 // so that TrueC (the true value) is larger than FalseC. 12502 bool NeedsCondInvert = false; 12503 12504 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 12505 // Efficiently invertible. 12506 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 12507 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 12508 isa<ConstantSDNode>(Cond.getOperand(1))))) { 12509 NeedsCondInvert = true; 12510 std::swap(TrueC, FalseC); 12511 } 12512 12513 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 12514 if (FalseC->getAPIntValue() == 0 && 12515 TrueC->getAPIntValue().isPowerOf2()) { 12516 if (NeedsCondInvert) // Invert the condition if needed. 12517 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 12518 DAG.getConstant(1, Cond.getValueType())); 12519 12520 // Zero extend the condition if needed. 
12521 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 12522 12523 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 12524 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 12525 DAG.getConstant(ShAmt, MVT::i8)); 12526 } 12527 12528 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 12529 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 12530 if (NeedsCondInvert) // Invert the condition if needed. 12531 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 12532 DAG.getConstant(1, Cond.getValueType())); 12533 12534 // Zero extend the condition if needed. 12535 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 12536 FalseC->getValueType(0), Cond); 12537 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 12538 SDValue(FalseC, 0)); 12539 } 12540 12541 // Optimize cases that will turn into an LEA instruction. This requires 12542 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 12543 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 12544 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 12545 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 12546 12547 bool isFastMultiplier = false; 12548 if (Diff < 10) { 12549 switch ((unsigned char)Diff) { 12550 default: break; 12551 case 1: // result = add base, cond 12552 case 2: // result = lea base( , cond*2) 12553 case 3: // result = lea base(cond, cond*2) 12554 case 4: // result = lea base( , cond*4) 12555 case 5: // result = lea base(cond, cond*4) 12556 case 8: // result = lea base( , cond*8) 12557 case 9: // result = lea base(cond, cond*8) 12558 isFastMultiplier = true; 12559 break; 12560 } 12561 } 12562 12563 if (isFastMultiplier) { 12564 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 12565 if (NeedsCondInvert) // Invert the condition if needed. 12566 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 12567 DAG.getConstant(1, Cond.getValueType())); 12568 12569 // Zero extend the condition if needed. 12570 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 12571 Cond); 12572 // Scale the condition by the difference. 12573 if (Diff != 1) 12574 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 12575 DAG.getConstant(Diff, Cond.getValueType())); 12576 12577 // Add the base if non-zero. 12578 if (FalseC->getAPIntValue() != 0) 12579 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 12580 SDValue(FalseC, 0)); 12581 return Cond; 12582 } 12583 } 12584 } 12585 } 12586 12587 return SDValue(); 12588} 12589 12590/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 12591static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 12592 TargetLowering::DAGCombinerInfo &DCI) { 12593 DebugLoc DL = N->getDebugLoc(); 12594 12595 // If the flag operand isn't dead, don't touch this CMOV. 12596 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 12597 return SDValue(); 12598 12599 SDValue FalseOp = N->getOperand(0); 12600 SDValue TrueOp = N->getOperand(1); 12601 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 12602 SDValue Cond = N->getOperand(3); 12603 if (CC == X86::COND_E || CC == X86::COND_NE) { 12604 switch (Cond.getOpcode()) { 12605 default: break; 12606 case X86ISD::BSR: 12607 case X86ISD::BSF: 12608 // If operand of BSR / BSF are proven never zero, then ZF cannot be set. 12609 if (DAG.isKnownNeverZero(Cond.getOperand(0))) 12610 return (CC == X86::COND_E) ? 
FalseOp : TrueOp; 12611 } 12612 } 12613 12614 // If this is a select between two integer constants, try to do some 12615 // optimizations. Note that the operands are ordered the opposite of SELECT 12616 // operands. 12617 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { 12618 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { 12619 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 12620 // larger than FalseC (the false value). 12621 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 12622 CC = X86::GetOppositeBranchCondition(CC); 12623 std::swap(TrueC, FalseC); 12624 } 12625 12626 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 12627 // This is efficient for any integer data type (including i8/i16) and 12628 // shift amount. 12629 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 12630 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 12631 DAG.getConstant(CC, MVT::i8), Cond); 12632 12633 // Zero extend the condition if needed. 12634 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 12635 12636 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 12637 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 12638 DAG.getConstant(ShAmt, MVT::i8)); 12639 if (N->getNumValues() == 2) // Dead flag value? 12640 return DCI.CombineTo(N, Cond, SDValue()); 12641 return Cond; 12642 } 12643 12644 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 12645 // for any integer data type, including i8/i16. 12646 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 12647 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 12648 DAG.getConstant(CC, MVT::i8), Cond); 12649 12650 // Zero extend the condition if needed. 12651 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 12652 FalseC->getValueType(0), Cond); 12653 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 12654 SDValue(FalseC, 0)); 12655 12656 if (N->getNumValues() == 2) // Dead flag value? 12657 return DCI.CombineTo(N, Cond, SDValue()); 12658 return Cond; 12659 } 12660 12661 // Optimize cases that will turn into an LEA instruction. This requires 12662 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 12663 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 12664 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 12665 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 12666 12667 bool isFastMultiplier = false; 12668 if (Diff < 10) { 12669 switch ((unsigned char)Diff) { 12670 default: break; 12671 case 1: // result = add base, cond 12672 case 2: // result = lea base( , cond*2) 12673 case 3: // result = lea base(cond, cond*2) 12674 case 4: // result = lea base( , cond*4) 12675 case 5: // result = lea base(cond, cond*4) 12676 case 8: // result = lea base( , cond*8) 12677 case 9: // result = lea base(cond, cond*8) 12678 isFastMultiplier = true; 12679 break; 12680 } 12681 } 12682 12683 if (isFastMultiplier) { 12684 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 12685 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 12686 DAG.getConstant(CC, MVT::i8), Cond); 12687 // Zero extend the condition if needed. 12688 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 12689 Cond); 12690 // Scale the condition by the difference. 12691 if (Diff != 1) 12692 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 12693 DAG.getConstant(Diff, Cond.getValueType())); 12694 12695 // Add the base if non-zero. 
12696 if (FalseC->getAPIntValue() != 0)
12697 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
12698 SDValue(FalseC, 0));
12699 if (N->getNumValues() == 2) // Dead flag value?
12700 return DCI.CombineTo(N, Cond, SDValue());
12701 return Cond;
12702 }
12703 }
12704 }
12705 }
12706 return SDValue();
12707}
12708
12709
12710/// PerformMulCombine - Optimize a single multiply with constant into two
12711/// in order to implement it with two cheaper instructions, e.g.
12712/// LEA + SHL, LEA + LEA.
12713static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
12714 TargetLowering::DAGCombinerInfo &DCI) {
12715 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
12716 return SDValue();
12717
12718 EVT VT = N->getValueType(0);
12719 if (VT != MVT::i64)
12720 return SDValue();
12721
12722 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
12723 if (!C)
12724 return SDValue();
12725 uint64_t MulAmt = C->getZExtValue();
12726 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
12727 return SDValue();
12728
12729 uint64_t MulAmt1 = 0;
12730 uint64_t MulAmt2 = 0;
12731 if ((MulAmt % 9) == 0) {
12732 MulAmt1 = 9;
12733 MulAmt2 = MulAmt / 9;
12734 } else if ((MulAmt % 5) == 0) {
12735 MulAmt1 = 5;
12736 MulAmt2 = MulAmt / 5;
12737 } else if ((MulAmt % 3) == 0) {
12738 MulAmt1 = 3;
12739 MulAmt2 = MulAmt / 3;
12740 }
12741 if (MulAmt2 &&
12742 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
12743 DebugLoc DL = N->getDebugLoc();
12744
12745 if (isPowerOf2_64(MulAmt2) &&
12746 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
12747 // If the second multiplier is pow2, issue it first. We want the multiply
12748 // by 3, 5, or 9 to be folded into the addressing mode unless the lone use
12749 // is an add.
12750 std::swap(MulAmt1, MulAmt2);
12751
12752 SDValue NewMul;
12753 if (isPowerOf2_64(MulAmt1))
12754 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
12755 DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
12756 else
12757 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
12758 DAG.getConstant(MulAmt1, VT));
12759
12760 if (isPowerOf2_64(MulAmt2))
12761 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
12762 DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
12763 else
12764 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
12765 DAG.getConstant(MulAmt2, VT));
12766
12767 // Do not add new nodes to DAG combiner worklist.
12768 DCI.CombineTo(N, NewMul, false);
12769 }
12770 return SDValue();
12771}
12772
12773static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
12774 SDValue N0 = N->getOperand(0);
12775 SDValue N1 = N->getOperand(1);
12776 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
12777 EVT VT = N0.getValueType();
12778
12779 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
12780 // since the result of setcc_c is all zeros or all ones.
12781 if (N1C && N0.getOpcode() == ISD::AND && 12782 N0.getOperand(1).getOpcode() == ISD::Constant) { 12783 SDValue N00 = N0.getOperand(0); 12784 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 12785 ((N00.getOpcode() == ISD::ANY_EXTEND || 12786 N00.getOpcode() == ISD::ZERO_EXTEND) && 12787 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 12788 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 12789 APInt ShAmt = N1C->getAPIntValue(); 12790 Mask = Mask.shl(ShAmt); 12791 if (Mask != 0) 12792 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, 12793 N00, DAG.getConstant(Mask, VT)); 12794 } 12795 } 12796 12797 return SDValue(); 12798} 12799 12800/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 12801/// when possible. 12802static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 12803 const X86Subtarget *Subtarget) { 12804 EVT VT = N->getValueType(0); 12805 if (!VT.isVector() && VT.isInteger() && 12806 N->getOpcode() == ISD::SHL) 12807 return PerformSHLCombine(N, DAG); 12808 12809 // On X86 with SSE2 support, we can transform this to a vector shift if 12810 // all elements are shifted by the same amount. We can't do this in legalize 12811 // because the a constant vector is typically transformed to a constant pool 12812 // so we have no knowledge of the shift amount. 12813 if (!(Subtarget->hasSSE2() || Subtarget->hasAVX())) 12814 return SDValue(); 12815 12816 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 12817 return SDValue(); 12818 12819 SDValue ShAmtOp = N->getOperand(1); 12820 EVT EltVT = VT.getVectorElementType(); 12821 DebugLoc DL = N->getDebugLoc(); 12822 SDValue BaseShAmt = SDValue(); 12823 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 12824 unsigned NumElts = VT.getVectorNumElements(); 12825 unsigned i = 0; 12826 for (; i != NumElts; ++i) { 12827 SDValue Arg = ShAmtOp.getOperand(i); 12828 if (Arg.getOpcode() == ISD::UNDEF) continue; 12829 BaseShAmt = Arg; 12830 break; 12831 } 12832 for (; i != NumElts; ++i) { 12833 SDValue Arg = ShAmtOp.getOperand(i); 12834 if (Arg.getOpcode() == ISD::UNDEF) continue; 12835 if (Arg != BaseShAmt) { 12836 return SDValue(); 12837 } 12838 } 12839 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 12840 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 12841 SDValue InVec = ShAmtOp.getOperand(0); 12842 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 12843 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 12844 unsigned i = 0; 12845 for (; i != NumElts; ++i) { 12846 SDValue Arg = InVec.getOperand(i); 12847 if (Arg.getOpcode() == ISD::UNDEF) continue; 12848 BaseShAmt = Arg; 12849 break; 12850 } 12851 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 12852 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 12853 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 12854 if (C->getZExtValue() == SplatIdx) 12855 BaseShAmt = InVec.getOperand(1); 12856 } 12857 } 12858 if (BaseShAmt.getNode() == 0) 12859 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 12860 DAG.getIntPtrConstant(0)); 12861 } else 12862 return SDValue(); 12863 12864 // The shift amount is an i32. 12865 if (EltVT.bitsGT(MVT::i32)) 12866 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 12867 else if (EltVT.bitsLT(MVT::i32)) 12868 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 12869 12870 // The shift amount is identical so we can do a vector shift. 
12871 SDValue ValOp = N->getOperand(0); 12872 switch (N->getOpcode()) { 12873 default: 12874 llvm_unreachable("Unknown shift opcode!"); 12875 break; 12876 case ISD::SHL: 12877 if (VT == MVT::v2i64) 12878 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12879 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 12880 ValOp, BaseShAmt); 12881 if (VT == MVT::v4i32) 12882 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12883 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 12884 ValOp, BaseShAmt); 12885 if (VT == MVT::v8i16) 12886 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12887 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 12888 ValOp, BaseShAmt); 12889 break; 12890 case ISD::SRA: 12891 if (VT == MVT::v4i32) 12892 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12893 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 12894 ValOp, BaseShAmt); 12895 if (VT == MVT::v8i16) 12896 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12897 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 12898 ValOp, BaseShAmt); 12899 break; 12900 case ISD::SRL: 12901 if (VT == MVT::v2i64) 12902 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12903 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 12904 ValOp, BaseShAmt); 12905 if (VT == MVT::v4i32) 12906 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12907 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 12908 ValOp, BaseShAmt); 12909 if (VT == MVT::v8i16) 12910 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12911 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 12912 ValOp, BaseShAmt); 12913 break; 12914 } 12915 return SDValue(); 12916} 12917 12918 12919// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) 12920// where both setccs reference the same FP CMP, and rewrite for CMPEQSS 12921// and friends. Likewise for OR -> CMPNEQSS. 12922static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, 12923 TargetLowering::DAGCombinerInfo &DCI, 12924 const X86Subtarget *Subtarget) { 12925 unsigned opcode; 12926 12927 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but 12928 // we're requiring SSE2 for both. 12929 if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { 12930 SDValue N0 = N->getOperand(0); 12931 SDValue N1 = N->getOperand(1); 12932 SDValue CMP0 = N0->getOperand(1); 12933 SDValue CMP1 = N1->getOperand(1); 12934 DebugLoc DL = N->getDebugLoc(); 12935 12936 // The SETCCs should both refer to the same CMP. 
12937 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) 12938 return SDValue(); 12939 12940 SDValue CMP00 = CMP0->getOperand(0); 12941 SDValue CMP01 = CMP0->getOperand(1); 12942 EVT VT = CMP00.getValueType(); 12943 12944 if (VT == MVT::f32 || VT == MVT::f64) { 12945 bool ExpectingFlags = false; 12946 // Check for any users that want flags: 12947 for (SDNode::use_iterator UI = N->use_begin(), 12948 UE = N->use_end(); 12949 !ExpectingFlags && UI != UE; ++UI) 12950 switch (UI->getOpcode()) { 12951 default: 12952 case ISD::BR_CC: 12953 case ISD::BRCOND: 12954 case ISD::SELECT: 12955 ExpectingFlags = true; 12956 break; 12957 case ISD::CopyToReg: 12958 case ISD::SIGN_EXTEND: 12959 case ISD::ZERO_EXTEND: 12960 case ISD::ANY_EXTEND: 12961 break; 12962 } 12963 12964 if (!ExpectingFlags) { 12965 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); 12966 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); 12967 12968 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { 12969 X86::CondCode tmp = cc0; 12970 cc0 = cc1; 12971 cc1 = tmp; 12972 } 12973 12974 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || 12975 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { 12976 bool is64BitFP = (CMP00.getValueType() == MVT::f64); 12977 X86ISD::NodeType NTOperator = is64BitFP ? 12978 X86ISD::FSETCCsd : X86ISD::FSETCCss; 12979 // FIXME: need symbolic constants for these magic numbers. 12980 // See X86ATTInstPrinter.cpp:printSSECC(). 12981 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; 12982 SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01, 12983 DAG.getConstant(x86cc, MVT::i8)); 12984 SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32, 12985 OnesOrZeroesF); 12986 SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI, 12987 DAG.getConstant(1, MVT::i32)); 12988 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); 12989 return OneBitOfTruth; 12990 } 12991 } 12992 } 12993 } 12994 return SDValue(); 12995} 12996 12997/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector 12998/// so it can be folded inside ANDNP. 12999static bool CanFoldXORWithAllOnes(const SDNode *N) { 13000 EVT VT = N->getValueType(0); 13001 13002 // Match direct AllOnes for 128 and 256-bit vectors 13003 if (ISD::isBuildVectorAllOnes(N)) 13004 return true; 13005 13006 // Look through a bit convert. 13007 if (N->getOpcode() == ISD::BITCAST) 13008 N = N->getOperand(0).getNode(); 13009 13010 // Sometimes the operand may come from a insert_subvector building a 256-bit 13011 // allones vector 13012 if (VT.getSizeInBits() == 256 && 13013 N->getOpcode() == ISD::INSERT_SUBVECTOR) { 13014 SDValue V1 = N->getOperand(0); 13015 SDValue V2 = N->getOperand(1); 13016 13017 if (V1.getOpcode() == ISD::INSERT_SUBVECTOR && 13018 V1.getOperand(0).getOpcode() == ISD::UNDEF && 13019 ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && 13020 ISD::isBuildVectorAllOnes(V2.getNode())) 13021 return true; 13022 } 13023 13024 return false; 13025} 13026 13027static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, 13028 TargetLowering::DAGCombinerInfo &DCI, 13029 const X86Subtarget *Subtarget) { 13030 if (DCI.isBeforeLegalizeOps()) 13031 return SDValue(); 13032 13033 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 13034 if (R.getNode()) 13035 return R; 13036 13037 // Want to form ANDNP nodes: 13038 // 1) In the hopes of then easily combining them with OR and AND nodes 13039 // to form PBLEND/PSIGN. 
13040 // 2) To match ANDN packed intrinsics
13041 EVT VT = N->getValueType(0);
13042 if (VT != MVT::v2i64 && VT != MVT::v4i64)
13043 return SDValue();
13044
13045 SDValue N0 = N->getOperand(0);
13046 SDValue N1 = N->getOperand(1);
13047 DebugLoc DL = N->getDebugLoc();
13048
13049 // Check LHS for vnot
13050 if (N0.getOpcode() == ISD::XOR &&
13051 //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
13052 CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
13053 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
13054
13055 // Check RHS for vnot
13056 if (N1.getOpcode() == ISD::XOR &&
13057 //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
13058 CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
13059 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
13060
13061 return SDValue();
13062}
13063
13064static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
13065 TargetLowering::DAGCombinerInfo &DCI,
13066 const X86Subtarget *Subtarget) {
13067 if (DCI.isBeforeLegalizeOps())
13068 return SDValue();
13069
13070 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
13071 if (R.getNode())
13072 return R;
13073
13074 EVT VT = N->getValueType(0);
13075 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64)
13076 return SDValue();
13077
13078 SDValue N0 = N->getOperand(0);
13079 SDValue N1 = N->getOperand(1);
13080
13081 // look for psign/blend
13082 if (Subtarget->hasSSSE3()) {
13083 if (VT == MVT::v2i64) {
13084 // Canonicalize pandn to RHS
13085 if (N0.getOpcode() == X86ISD::ANDNP)
13086 std::swap(N0, N1);
13087 // or (and (m, x), (pandn m, y))
13088 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
13089 SDValue Mask = N1.getOperand(0);
13090 SDValue X = N1.getOperand(1);
13091 SDValue Y;
13092 if (N0.getOperand(0) == Mask)
13093 Y = N0.getOperand(1);
13094 if (N0.getOperand(1) == Mask)
13095 Y = N0.getOperand(0);
13096
13097 // Check to see if the mask appeared in both the AND and the ANDNP.
13098 if (!Y.getNode())
13099 return SDValue();
13100
13101 // Validate that X, Y, and Mask are BITCASTs, and see through them.
13102 if (Mask.getOpcode() != ISD::BITCAST ||
13103 X.getOpcode() != ISD::BITCAST ||
13104 Y.getOpcode() != ISD::BITCAST)
13105 return SDValue();
13106
13107 // Look through mask bitcast.
13108 Mask = Mask.getOperand(0);
13109 EVT MaskVT = Mask.getValueType();
13110
13111 // Validate that the Mask operand is a vector sra node. The sra node
13112 // will be an intrinsic.
13113 if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
13114 return SDValue();
13115
13116 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
13117 // there is no psrai.b
13118 switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) {
13119 case Intrinsic::x86_sse2_psrai_w:
13120 case Intrinsic::x86_sse2_psrai_d:
13121 break;
13122 default: return SDValue();
13123 }
13124
13125 // Check that the SRA is all signbits.
13126 SDValue SraC = Mask.getOperand(2);
13127 unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
13128 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
13129 if ((SraAmt + 1) != EltBits)
13130 return SDValue();
13131
13132 DebugLoc DL = N->getDebugLoc();
13133
13134 // Now we know we at least have a pblendvb with the mask val. See if
13135 // we can form a psignb/w/d.
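  // (PSIGNB/W/D keeps, zeroes, or negates each element of its first operand
  // depending on whether the corresponding element of its second operand is
  // positive, zero, or negative.)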
13136 // psign = x.type == y.type == mask.type && y = sub(0, x); 13137 X = X.getOperand(0); 13138 Y = Y.getOperand(0); 13139 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && 13140 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && 13141 X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){ 13142 unsigned Opc = 0; 13143 switch (EltBits) { 13144 case 8: Opc = X86ISD::PSIGNB; break; 13145 case 16: Opc = X86ISD::PSIGNW; break; 13146 case 32: Opc = X86ISD::PSIGND; break; 13147 default: break; 13148 } 13149 if (Opc) { 13150 SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1)); 13151 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign); 13152 } 13153 } 13154 // PBLENDVB only available on SSE 4.1 13155 if (!Subtarget->hasSSE41()) 13156 return SDValue(); 13157 13158 X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X); 13159 Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y); 13160 Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask); 13161 Mask = DAG.getNode(X86ISD::PBLENDVB, DL, MVT::v16i8, X, Y, Mask); 13162 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask); 13163 } 13164 } 13165 } 13166 13167 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 13168 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 13169 std::swap(N0, N1); 13170 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 13171 return SDValue(); 13172 if (!N0.hasOneUse() || !N1.hasOneUse()) 13173 return SDValue(); 13174 13175 SDValue ShAmt0 = N0.getOperand(1); 13176 if (ShAmt0.getValueType() != MVT::i8) 13177 return SDValue(); 13178 SDValue ShAmt1 = N1.getOperand(1); 13179 if (ShAmt1.getValueType() != MVT::i8) 13180 return SDValue(); 13181 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 13182 ShAmt0 = ShAmt0.getOperand(0); 13183 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 13184 ShAmt1 = ShAmt1.getOperand(0); 13185 13186 DebugLoc DL = N->getDebugLoc(); 13187 unsigned Opc = X86ISD::SHLD; 13188 SDValue Op0 = N0.getOperand(0); 13189 SDValue Op1 = N1.getOperand(0); 13190 if (ShAmt0.getOpcode() == ISD::SUB) { 13191 Opc = X86ISD::SHRD; 13192 std::swap(Op0, Op1); 13193 std::swap(ShAmt0, ShAmt1); 13194 } 13195 13196 unsigned Bits = VT.getSizeInBits(); 13197 if (ShAmt1.getOpcode() == ISD::SUB) { 13198 SDValue Sum = ShAmt1.getOperand(0); 13199 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 13200 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 13201 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 13202 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 13203 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 13204 return DAG.getNode(Opc, DL, VT, 13205 Op0, Op1, 13206 DAG.getNode(ISD::TRUNCATE, DL, 13207 MVT::i8, ShAmt0)); 13208 } 13209 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 13210 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 13211 if (ShAmt0C && 13212 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 13213 return DAG.getNode(Opc, DL, VT, 13214 N0.getOperand(0), N1.getOperand(0), 13215 DAG.getNode(ISD::TRUNCATE, DL, 13216 MVT::i8, ShAmt0)); 13217 } 13218 13219 return SDValue(); 13220} 13221 13222/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 
13223static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
13224 const X86Subtarget *Subtarget) {
13225 StoreSDNode *St = cast<StoreSDNode>(N);
13226 EVT VT = St->getValue().getValueType();
13227 EVT StVT = St->getMemoryVT();
13228 DebugLoc dl = St->getDebugLoc();
13229 SDValue StoredVal = St->getOperand(1);
13230 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13231
13232 // If we are saving a concatenation of two XMM registers, perform two stores.
13233 // This is better on Sandy Bridge because one 256-bit mem op is done via two
13234 // 128-bit ones. If in the future the cost becomes only one memory access the
13235 // first version would be better.
13236 if (VT.getSizeInBits() == 256 &&
13237 StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
13238 StoredVal.getNumOperands() == 2) {
13239
13240 SDValue Value0 = StoredVal.getOperand(0);
13241 SDValue Value1 = StoredVal.getOperand(1);
13242
13243 SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
13244 SDValue Ptr0 = St->getBasePtr();
13245 SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
13246
13247 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
13248 St->getPointerInfo(), St->isVolatile(),
13249 St->isNonTemporal(), St->getAlignment());
13250 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
13251 St->getPointerInfo(), St->isVolatile(),
13252 St->isNonTemporal(), St->getAlignment());
13253 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
13254 }
13255
13256 // Optimize trunc store (of multiple scalars) to shuffle and store.
13257 // First, pack all of the elements in one place. Next, store to memory
13258 // in fewer chunks.
13259 if (St->isTruncatingStore() && VT.isVector()) {
13260 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13261 unsigned NumElems = VT.getVectorNumElements();
13262 assert(StVT != VT && "Cannot truncate to the same type");
13263 unsigned FromSz = VT.getVectorElementType().getSizeInBits();
13264 unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
13265
13266 // From, To sizes and ElemCount must be pow of two
13267 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
13268 // We are going to use the original vector elt for storing.
13269 // The accumulated smaller vector elements must be a multiple of the bigger size.
13270 if (0 != (NumElems * ToSz) % FromSz) return SDValue();
13271 unsigned SizeRatio = FromSz / ToSz;
13272
13273 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
13274
13275 // Create a type on which we perform the shuffle
13276 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
13277 StVT.getScalarType(), NumElems*SizeRatio);
13278
13279 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
13280
13281 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
13282 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
13283 for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio;
13284
13285 // Can't shuffle using an illegal type
13286 if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
13287
13288 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
13289 DAG.getUNDEF(WideVec.getValueType()),
13290 ShuffleVec.data());
13291 // At this point all of the data is stored at the bottom of the
13292 // register. We now need to save it to mem.
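  // For example (illustrative only): a truncating store of v8i16 to v8i8 is
  // rewritten as a v16i8 shuffle that packs the eight result bytes into the
  // low 64 bits of the register, which the code below then writes with one
  // wide integer store (or a few narrower ones) rather than storing each
  // truncated element separately.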
13293 13294 // Find the largest store unit 13295 MVT StoreType = MVT::i8; 13296 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; 13297 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { 13298 MVT Tp = (MVT::SimpleValueType)tp; 13299 if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz) 13300 StoreType = Tp; 13301 } 13302 13303 // Bitcast the original vector into a vector of store-size units 13304 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 13305 StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); 13306 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 13307 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); 13308 SmallVector<SDValue, 8> Chains; 13309 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, 13310 TLI.getPointerTy()); 13311 SDValue Ptr = St->getBasePtr(); 13312 13313 // Perform one or more big stores into memory. 13314 for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) { 13315 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 13316 StoreType, ShuffWide, 13317 DAG.getIntPtrConstant(i)); 13318 SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, 13319 St->getPointerInfo(), St->isVolatile(), 13320 St->isNonTemporal(), St->getAlignment()); 13321 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 13322 Chains.push_back(Ch); 13323 } 13324 13325 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], 13326 Chains.size()); 13327 } 13328 13329 13330 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 13331 // the FP state in cases where an emms may be missing. 13332 // A preferable solution to the general problem is to figure out the right 13333 // places to insert EMMS. This qualifies as a quick hack. 13334 13335 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 13336 if (VT.getSizeInBits() != 64) 13337 return SDValue(); 13338 13339 const Function *F = DAG.getMachineFunction().getFunction(); 13340 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 13341 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 13342 && Subtarget->hasSSE2(); 13343 if ((VT.isVector() || 13344 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 13345 isa<LoadSDNode>(St->getValue()) && 13346 !cast<LoadSDNode>(St->getValue())->isVolatile() && 13347 St->getChain().hasOneUse() && !St->isVolatile()) { 13348 SDNode* LdVal = St->getValue().getNode(); 13349 LoadSDNode *Ld = 0; 13350 int TokenFactorIndex = -1; 13351 SmallVector<SDValue, 8> Ops; 13352 SDNode* ChainVal = St->getChain().getNode(); 13353 // Must be a store of a load. We currently handle two cases: the load 13354 // is a direct child, and it's under an intervening TokenFactor. It is 13355 // possible to dig deeper under nested TokenFactors. 13356 if (ChainVal == LdVal) 13357 Ld = cast<LoadSDNode>(St->getChain()); 13358 else if (St->getValue().hasOneUse() && 13359 ChainVal->getOpcode() == ISD::TokenFactor) { 13360 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 13361 if (ChainVal->getOperand(i).getNode() == LdVal) { 13362 TokenFactorIndex = i; 13363 Ld = cast<LoadSDNode>(St->getValue()); 13364 } else 13365 Ops.push_back(ChainVal->getOperand(i)); 13366 } 13367 } 13368 13369 if (!Ld || !ISD::isNormalLoad(Ld)) 13370 return SDValue(); 13371 13372 // If this is not the MMX case, i.e. we are just turning i64 load/store 13373 // into f64 load/store, avoid the transformation if there are multiple 13374 // uses of the loaded value. 
13375 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 13376 return SDValue(); 13377 13378 DebugLoc LdDL = Ld->getDebugLoc(); 13379 DebugLoc StDL = N->getDebugLoc(); 13380 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 13381 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 13382 // pair instead. 13383 if (Subtarget->is64Bit() || F64IsLegal) { 13384 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 13385 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 13386 Ld->getPointerInfo(), Ld->isVolatile(), 13387 Ld->isNonTemporal(), Ld->getAlignment()); 13388 SDValue NewChain = NewLd.getValue(1); 13389 if (TokenFactorIndex != -1) { 13390 Ops.push_back(NewChain); 13391 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 13392 Ops.size()); 13393 } 13394 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 13395 St->getPointerInfo(), 13396 St->isVolatile(), St->isNonTemporal(), 13397 St->getAlignment()); 13398 } 13399 13400 // Otherwise, lower to two pairs of 32-bit loads / stores. 13401 SDValue LoAddr = Ld->getBasePtr(); 13402 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 13403 DAG.getConstant(4, MVT::i32)); 13404 13405 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 13406 Ld->getPointerInfo(), 13407 Ld->isVolatile(), Ld->isNonTemporal(), 13408 Ld->getAlignment()); 13409 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 13410 Ld->getPointerInfo().getWithOffset(4), 13411 Ld->isVolatile(), Ld->isNonTemporal(), 13412 MinAlign(Ld->getAlignment(), 4)); 13413 13414 SDValue NewChain = LoLd.getValue(1); 13415 if (TokenFactorIndex != -1) { 13416 Ops.push_back(LoLd); 13417 Ops.push_back(HiLd); 13418 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 13419 Ops.size()); 13420 } 13421 13422 LoAddr = St->getBasePtr(); 13423 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 13424 DAG.getConstant(4, MVT::i32)); 13425 13426 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 13427 St->getPointerInfo(), 13428 St->isVolatile(), St->isNonTemporal(), 13429 St->getAlignment()); 13430 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 13431 St->getPointerInfo().getWithOffset(4), 13432 St->isVolatile(), 13433 St->isNonTemporal(), 13434 MinAlign(St->getAlignment(), 4)); 13435 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 13436 } 13437 return SDValue(); 13438} 13439 13440/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 13441/// X86ISD::FXOR nodes. 13442static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 13443 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 13444 // F[X]OR(0.0, x) -> x 13445 // F[X]OR(x, 0.0) -> x 13446 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 13447 if (C->getValueAPF().isPosZero()) 13448 return N->getOperand(1); 13449 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 13450 if (C->getValueAPF().isPosZero()) 13451 return N->getOperand(0); 13452 return SDValue(); 13453} 13454 13455/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 
13456static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 13457 // FAND(0.0, x) -> 0.0 13458 // FAND(x, 0.0) -> 0.0 13459 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 13460 if (C->getValueAPF().isPosZero()) 13461 return N->getOperand(0); 13462 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 13463 if (C->getValueAPF().isPosZero()) 13464 return N->getOperand(1); 13465 return SDValue(); 13466} 13467 13468static SDValue PerformBTCombine(SDNode *N, 13469 SelectionDAG &DAG, 13470 TargetLowering::DAGCombinerInfo &DCI) { 13471 // BT ignores high bits in the bit index operand. 13472 SDValue Op1 = N->getOperand(1); 13473 if (Op1.hasOneUse()) { 13474 unsigned BitWidth = Op1.getValueSizeInBits(); 13475 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 13476 APInt KnownZero, KnownOne; 13477 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 13478 !DCI.isBeforeLegalizeOps()); 13479 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13480 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 13481 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 13482 DCI.CommitTargetLoweringOpt(TLO); 13483 } 13484 return SDValue(); 13485} 13486 13487static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 13488 SDValue Op = N->getOperand(0); 13489 if (Op.getOpcode() == ISD::BITCAST) 13490 Op = Op.getOperand(0); 13491 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 13492 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 13493 VT.getVectorElementType().getSizeInBits() == 13494 OpVT.getVectorElementType().getSizeInBits()) { 13495 return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); 13496 } 13497 return SDValue(); 13498} 13499 13500static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 13501 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 13502 // (and (i32 x86isd::setcc_carry), 1) 13503 // This eliminates the zext. This transformation is necessary because 13504 // ISD::SETCC is always legalized to i8. 13505 DebugLoc dl = N->getDebugLoc(); 13506 SDValue N0 = N->getOperand(0); 13507 EVT VT = N->getValueType(0); 13508 if (N0.getOpcode() == ISD::AND && 13509 N0.hasOneUse() && 13510 N0.getOperand(0).hasOneUse()) { 13511 SDValue N00 = N0.getOperand(0); 13512 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 13513 return SDValue(); 13514 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 13515 if (!C || C->getZExtValue() != 1) 13516 return SDValue(); 13517 return DAG.getNode(ISD::AND, dl, VT, 13518 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 13519 N00.getOperand(0), N00.getOperand(1)), 13520 DAG.getConstant(1, VT)); 13521 } 13522 13523 return SDValue(); 13524} 13525 13526// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 13527static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { 13528 unsigned X86CC = N->getConstantOperandVal(0); 13529 SDValue EFLAG = N->getOperand(1); 13530 DebugLoc DL = N->getDebugLoc(); 13531 13532 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 13533 // a zext and produces an all-ones bit which is more useful than 0/1 in some 13534 // cases. 
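  // Note (illustrative): SETB materializes 0 or 1, while SETCC_CARRY
  // (sbb reg,reg) materializes 0 or all-ones; the AND with 1 below recovers
  // the 0/1 value while keeping the all-ones form visible to later combines
  // such as the zext combine above.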
13535 if (X86CC == X86::COND_B) 13536 return DAG.getNode(ISD::AND, DL, MVT::i8, 13537 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 13538 DAG.getConstant(X86CC, MVT::i8), EFLAG), 13539 DAG.getConstant(1, MVT::i8)); 13540 13541 return SDValue(); 13542} 13543 13544static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, 13545 const X86TargetLowering *XTLI) { 13546 SDValue Op0 = N->getOperand(0); 13547 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have 13548 // a 32-bit target where SSE doesn't support i64->FP operations. 13549 if (Op0.getOpcode() == ISD::LOAD) { 13550 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); 13551 EVT VT = Ld->getValueType(0); 13552 if (!Ld->isVolatile() && !N->getValueType(0).isVector() && 13553 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && 13554 !XTLI->getSubtarget()->is64Bit() && 13555 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 13556 SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), 13557 Ld->getChain(), Op0, DAG); 13558 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); 13559 return FILDChain; 13560 } 13561 } 13562 return SDValue(); 13563} 13564 13565// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS 13566static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, 13567 X86TargetLowering::DAGCombinerInfo &DCI) { 13568 // If the LHS and RHS of the ADC node are zero, then it can't overflow and 13569 // the result is either zero or one (depending on the input carry bit). 13570 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 13571 if (X86::isZeroNode(N->getOperand(0)) && 13572 X86::isZeroNode(N->getOperand(1)) && 13573 // We don't have a good way to replace an EFLAGS use, so only do this when 13574 // dead right now. 13575 SDValue(N, 1).use_empty()) { 13576 DebugLoc DL = N->getDebugLoc(); 13577 EVT VT = N->getValueType(0); 13578 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); 13579 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, 13580 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, 13581 DAG.getConstant(X86::COND_B,MVT::i8), 13582 N->getOperand(2)), 13583 DAG.getConstant(1, VT)); 13584 return DCI.CombineTo(N, Res1, CarryOut); 13585 } 13586 13587 return SDValue(); 13588} 13589 13590// fold (add Y, (sete X, 0)) -> adc 0, Y 13591// (add Y, (setne X, 0)) -> sbb -1, Y 13592// (sub (sete X, 0), Y) -> sbb 0, Y 13593// (sub (setne X, 0), Y) -> adc -1, Y 13594static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { 13595 DebugLoc DL = N->getDebugLoc(); 13596 13597 // Look through ZExts. 13598 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); 13599 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) 13600 return SDValue(); 13601 13602 SDValue SetCC = Ext.getOperand(0); 13603 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) 13604 return SDValue(); 13605 13606 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); 13607 if (CC != X86::COND_E && CC != X86::COND_NE) 13608 return SDValue(); 13609 13610 SDValue Cmp = SetCC.getOperand(1); 13611 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || 13612 !X86::isZeroNode(Cmp.getOperand(1)) || 13613 !Cmp.getOperand(0).getValueType().isInteger()) 13614 return SDValue(); 13615 13616 SDValue CmpOp0 = Cmp.getOperand(0); 13617 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, 13618 DAG.getConstant(1, CmpOp0.getValueType())); 13619 13620 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 
0 : 1); 13621 if (CC == X86::COND_NE) 13622 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, 13623 DL, OtherVal.getValueType(), OtherVal, 13624 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); 13625 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, 13626 DL, OtherVal.getValueType(), OtherVal, 13627 DAG.getConstant(0, OtherVal.getValueType()), NewCmp); 13628} 13629 13630static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) { 13631 SDValue Op0 = N->getOperand(0); 13632 SDValue Op1 = N->getOperand(1); 13633 13634 // X86 can't encode an immediate LHS of a sub. See if we can push the 13635 // negation into a preceding instruction. 13636 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) { 13637 // If the RHS of the sub is a XOR with one use and a constant, invert the 13638 // immediate. Then add one to the LHS of the sub so we can turn 13639 // X-Y -> X+~Y+1, saving one register. 13640 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && 13641 isa<ConstantSDNode>(Op1.getOperand(1))) { 13642 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue(); 13643 EVT VT = Op0.getValueType(); 13644 SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT, 13645 Op1.getOperand(0), 13646 DAG.getConstant(~XorC, VT)); 13647 return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor, 13648 DAG.getConstant(C->getAPIntValue()+1, VT)); 13649 } 13650 } 13651 13652 return OptimizeConditionalInDecrement(N, DAG); 13653} 13654 13655SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 13656 DAGCombinerInfo &DCI) const { 13657 SelectionDAG &DAG = DCI.DAG; 13658 switch (N->getOpcode()) { 13659 default: break; 13660 case ISD::EXTRACT_VECTOR_ELT: 13661 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 13662 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 13663 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 13664 case ISD::ADD: return OptimizeConditionalInDecrement(N, DAG); 13665 case ISD::SUB: return PerformSubCombine(N, DAG); 13666 case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); 13667 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 13668 case ISD::SHL: 13669 case ISD::SRA: 13670 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 13671 case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); 13672 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 13673 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 13674 case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); 13675 case X86ISD::FXOR: 13676 case X86ISD::FOR: return PerformFORCombine(N, DAG); 13677 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 13678 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 13679 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 13680 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 13681 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); 13682 case X86ISD::SHUFPS: // Handle all target specific shuffles 13683 case X86ISD::SHUFPD: 13684 case X86ISD::PALIGN: 13685 case X86ISD::PUNPCKHBW: 13686 case X86ISD::PUNPCKHWD: 13687 case X86ISD::PUNPCKHDQ: 13688 case X86ISD::PUNPCKHQDQ: 13689 case X86ISD::UNPCKHPS: 13690 case X86ISD::UNPCKHPD: 13691 case X86ISD::VUNPCKHPSY: 13692 case X86ISD::VUNPCKHPDY: 13693 case X86ISD::PUNPCKLBW: 13694 case X86ISD::PUNPCKLWD: 13695 case X86ISD::PUNPCKLDQ: 13696 case X86ISD::PUNPCKLQDQ: 13697 case X86ISD::UNPCKLPS: 13698 case X86ISD::UNPCKLPD: 13699 case X86ISD::VUNPCKLPSY: 13700 
case X86ISD::VUNPCKLPDY:
13701 case X86ISD::MOVHLPS:
13702 case X86ISD::MOVLHPS:
13703 case X86ISD::PSHUFD:
13704 case X86ISD::PSHUFHW:
13705 case X86ISD::PSHUFLW:
13706 case X86ISD::MOVSS:
13707 case X86ISD::MOVSD:
13708 case X86ISD::VPERMILPS:
13709 case X86ISD::VPERMILPSY:
13710 case X86ISD::VPERMILPD:
13711 case X86ISD::VPERMILPDY:
13712 case X86ISD::VPERM2F128:
13713 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI, Subtarget);
13714 }
13715
13716 return SDValue();
13717}
13718
13719/// isTypeDesirableForOp - Return true if the target has native support for
13720/// the specified value type and it is 'desirable' to use the type for the
13721/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
13722/// instruction encodings are longer and some i16 instructions are slow.
13723bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
13724 if (!isTypeLegal(VT))
13725 return false;
13726 if (VT != MVT::i16)
13727 return true;
13728
13729 switch (Opc) {
13730 default:
13731 return true;
13732 case ISD::LOAD:
13733 case ISD::SIGN_EXTEND:
13734 case ISD::ZERO_EXTEND:
13735 case ISD::ANY_EXTEND:
13736 case ISD::SHL:
13737 case ISD::SRL:
13738 case ISD::SUB:
13739 case ISD::ADD:
13740 case ISD::MUL:
13741 case ISD::AND:
13742 case ISD::OR:
13743 case ISD::XOR:
13744 return false;
13745 }
13746}
13747
13748/// IsDesirableToPromoteOp - This method queries the target whether it is
13749/// beneficial for dag combiner to promote the specified node. If true, it
13750/// should return the desired promotion type by reference.
13751bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
13752 EVT VT = Op.getValueType();
13753 if (VT != MVT::i16)
13754 return false;
13755
13756 bool Promote = false;
13757 bool Commute = false;
13758 switch (Op.getOpcode()) {
13759 default: break;
13760 case ISD::LOAD: {
13761 LoadSDNode *LD = cast<LoadSDNode>(Op);
13762 // If the non-extending load has a single use and it's not live out, then it
13763 // might be folded.
13764 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
13765 Op.hasOneUse()*/) {
13766 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
13767 UE = Op.getNode()->use_end(); UI != UE; ++UI) {
13768 // The only case where we'd want to promote LOAD (rather than it being
13769 // promoted as an operand) is when its only use is live-out.
13770 if (UI->getOpcode() != ISD::CopyToReg)
13771 return false;
13772 }
13773 }
13774 Promote = true;
13775 break;
13776 }
13777 case ISD::SIGN_EXTEND:
13778 case ISD::ZERO_EXTEND:
13779 case ISD::ANY_EXTEND:
13780 Promote = true;
13781 break;
13782 case ISD::SHL:
13783 case ISD::SRL: {
13784 SDValue N0 = Op.getOperand(0);
13785 // Look out for (store (shl (load), x)).
13786 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
13787 return false;
13788 Promote = true;
13789 break;
13790 }
13791 case ISD::ADD:
13792 case ISD::MUL:
13793 case ISD::AND:
13794 case ISD::OR:
13795 case ISD::XOR:
13796 Commute = true;
13797 // fallthrough
13798 case ISD::SUB: {
13799 SDValue N0 = Op.getOperand(0);
13800 SDValue N1 = Op.getOperand(1);
13801 if (!Commute && MayFoldLoad(N1))
13802 return false;
13803 // Avoid disabling potential load folding opportunities.
13804 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 13805 return false; 13806 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 13807 return false; 13808 Promote = true; 13809 } 13810 } 13811 13812 PVT = MVT::i32; 13813 return Promote; 13814} 13815 13816//===----------------------------------------------------------------------===// 13817// X86 Inline Assembly Support 13818//===----------------------------------------------------------------------===// 13819 13820bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 13821 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 13822 13823 std::string AsmStr = IA->getAsmString(); 13824 13825 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 13826 SmallVector<StringRef, 4> AsmPieces; 13827 SplitString(AsmStr, AsmPieces, ";\n"); 13828 13829 switch (AsmPieces.size()) { 13830 default: return false; 13831 case 1: 13832 AsmStr = AsmPieces[0]; 13833 AsmPieces.clear(); 13834 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 13835 13836 // FIXME: this should verify that we are targeting a 486 or better. If not, 13837 // we will turn this bswap into something that will be lowered to logical ops 13838 // instead of emitting the bswap asm. For now, we don't support 486 or lower 13839 // so don't worry about this. 13840 // bswap $0 13841 if (AsmPieces.size() == 2 && 13842 (AsmPieces[0] == "bswap" || 13843 AsmPieces[0] == "bswapq" || 13844 AsmPieces[0] == "bswapl") && 13845 (AsmPieces[1] == "$0" || 13846 AsmPieces[1] == "${0:q}")) { 13847 // No need to check constraints, nothing other than the equivalent of 13848 // "=r,0" would be valid here. 13849 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 13850 if (!Ty || Ty->getBitWidth() % 16 != 0) 13851 return false; 13852 return IntrinsicLowering::LowerToByteSwap(CI); 13853 } 13854 // rorw $$8, ${0:w} --> llvm.bswap.i16 13855 if (CI->getType()->isIntegerTy(16) && 13856 AsmPieces.size() == 3 && 13857 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 13858 AsmPieces[1] == "$$8," && 13859 AsmPieces[2] == "${0:w}" && 13860 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 13861 AsmPieces.clear(); 13862 const std::string &ConstraintsStr = IA->getConstraintString(); 13863 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 13864 std::sort(AsmPieces.begin(), AsmPieces.end()); 13865 if (AsmPieces.size() == 4 && 13866 AsmPieces[0] == "~{cc}" && 13867 AsmPieces[1] == "~{dirflag}" && 13868 AsmPieces[2] == "~{flags}" && 13869 AsmPieces[3] == "~{fpsr}") { 13870 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 13871 if (!Ty || Ty->getBitWidth() % 16 != 0) 13872 return false; 13873 return IntrinsicLowering::LowerToByteSwap(CI); 13874 } 13875 } 13876 break; 13877 case 3: 13878 if (CI->getType()->isIntegerTy(32) && 13879 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 13880 SmallVector<StringRef, 4> Words; 13881 SplitString(AsmPieces[0], Words, " \t,"); 13882 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 13883 Words[2] == "${0:w}") { 13884 Words.clear(); 13885 SplitString(AsmPieces[1], Words, " \t,"); 13886 if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" && 13887 Words[2] == "$0") { 13888 Words.clear(); 13889 SplitString(AsmPieces[2], Words, " \t,"); 13890 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 13891 Words[2] == "${0:w}") { 13892 AsmPieces.clear(); 13893 const std::string &ConstraintsStr = 
IA->getConstraintString(); 13894 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 13895 std::sort(AsmPieces.begin(), AsmPieces.end()); 13896 if (AsmPieces.size() == 4 && 13897 AsmPieces[0] == "~{cc}" && 13898 AsmPieces[1] == "~{dirflag}" && 13899 AsmPieces[2] == "~{flags}" && 13900 AsmPieces[3] == "~{fpsr}") { 13901 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 13902 if (!Ty || Ty->getBitWidth() % 16 != 0) 13903 return false; 13904 return IntrinsicLowering::LowerToByteSwap(CI); 13905 } 13906 } 13907 } 13908 } 13909 } 13910 13911 if (CI->getType()->isIntegerTy(64)) { 13912 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); 13913 if (Constraints.size() >= 2 && 13914 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 13915 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 13916 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 13917 SmallVector<StringRef, 4> Words; 13918 SplitString(AsmPieces[0], Words, " \t"); 13919 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 13920 Words.clear(); 13921 SplitString(AsmPieces[1], Words, " \t"); 13922 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 13923 Words.clear(); 13924 SplitString(AsmPieces[2], Words, " \t,"); 13925 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 13926 Words[2] == "%edx") { 13927 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 13928 if (!Ty || Ty->getBitWidth() % 16 != 0) 13929 return false; 13930 return IntrinsicLowering::LowerToByteSwap(CI); 13931 } 13932 } 13933 } 13934 } 13935 } 13936 break; 13937 } 13938 return false; 13939} 13940 13941 13942 13943/// getConstraintType - Given a constraint letter, return the type of 13944/// constraint it is for this target. 13945X86TargetLowering::ConstraintType 13946X86TargetLowering::getConstraintType(const std::string &Constraint) const { 13947 if (Constraint.size() == 1) { 13948 switch (Constraint[0]) { 13949 case 'R': 13950 case 'q': 13951 case 'Q': 13952 case 'f': 13953 case 't': 13954 case 'u': 13955 case 'y': 13956 case 'x': 13957 case 'Y': 13958 case 'l': 13959 return C_RegisterClass; 13960 case 'a': 13961 case 'b': 13962 case 'c': 13963 case 'd': 13964 case 'S': 13965 case 'D': 13966 case 'A': 13967 return C_Register; 13968 case 'I': 13969 case 'J': 13970 case 'K': 13971 case 'L': 13972 case 'M': 13973 case 'N': 13974 case 'G': 13975 case 'C': 13976 case 'e': 13977 case 'Z': 13978 return C_Other; 13979 default: 13980 break; 13981 } 13982 } 13983 return TargetLowering::getConstraintType(Constraint); 13984} 13985 13986/// Examine constraint type and operand type and determine a weight value. 13987/// This object must already have been set up with the operand type 13988/// and the current alternative constraint selected. 13989TargetLowering::ConstraintWeight 13990 X86TargetLowering::getSingleConstraintMatchWeight( 13991 AsmOperandInfo &info, const char *constraint) const { 13992 ConstraintWeight weight = CW_Invalid; 13993 Value *CallOperandVal = info.CallOperandVal; 13994 // If we don't have a value, we can't do a match, 13995 // but allow it at the lowest weight. 13996 if (CallOperandVal == NULL) 13997 return CW_Default; 13998 Type *type = CallOperandVal->getType(); 13999 // Look at the constraint type. 
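  // For example (illustrative): for the 'I' alternative handled below, a
  // ConstantInt operand in [0, 31] scores CW_Constant, while a non-constant
  // operand keeps the initial CW_Invalid weight, so another alternative of a
  // multi-alternative constraint can be preferred instead.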
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
  case 'R':
  case 'q':
  case 'Q':
  case 'a':
  case 'b':
  case 'c':
  case 'd':
  case 'S':
  case 'D':
  case 'A':
    if (CallOperandVal->getType()->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f':
  case 't':
  case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget->hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'x':
  case 'Y':
    if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM())
      weight = CW_Register;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G':
  case 'C':
    if (isa<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}

/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasXMMInt())
      return "Y";
    if (Subtarget->hasXMM())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result(0, 0);

  // Only support length 1 constraints for now.
14116 if (Constraint.length() > 1) return; 14117 14118 char ConstraintLetter = Constraint[0]; 14119 switch (ConstraintLetter) { 14120 default: break; 14121 case 'I': 14122 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 14123 if (C->getZExtValue() <= 31) { 14124 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 14125 break; 14126 } 14127 } 14128 return; 14129 case 'J': 14130 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 14131 if (C->getZExtValue() <= 63) { 14132 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 14133 break; 14134 } 14135 } 14136 return; 14137 case 'K': 14138 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 14139 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 14140 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 14141 break; 14142 } 14143 } 14144 return; 14145 case 'N': 14146 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 14147 if (C->getZExtValue() <= 255) { 14148 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 14149 break; 14150 } 14151 } 14152 return; 14153 case 'e': { 14154 // 32-bit signed value 14155 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 14156 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 14157 C->getSExtValue())) { 14158 // Widen to 64 bits here to get it sign extended. 14159 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 14160 break; 14161 } 14162 // FIXME gcc accepts some relocatable values here too, but only in certain 14163 // memory models; it's complicated. 14164 } 14165 return; 14166 } 14167 case 'Z': { 14168 // 32-bit unsigned value 14169 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 14170 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 14171 C->getZExtValue())) { 14172 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 14173 break; 14174 } 14175 } 14176 // FIXME gcc accepts some relocatable values here too, but only in certain 14177 // memory models; it's complicated. 14178 return; 14179 } 14180 case 'i': { 14181 // Literal immediates are always ok. 14182 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 14183 // Widen to 64 bits here to get it sign extended. 14184 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 14185 break; 14186 } 14187 14188 // In any sort of PIC mode addresses need to be computed at runtime by 14189 // adding in a register or some sort of table lookup. These can't 14190 // be used as immediates. 14191 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 14192 return; 14193 14194 // If we are in non-pic codegen mode, we allow the address of a global (with 14195 // an optional displacement) to be used with 'i'. 14196 GlobalAddressSDNode *GA = 0; 14197 int64_t Offset = 0; 14198 14199 // Match either (GA), (GA+C), (GA+C1+C2), etc. 14200 while (1) { 14201 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 14202 Offset += GA->getOffset(); 14203 break; 14204 } else if (Op.getOpcode() == ISD::ADD) { 14205 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 14206 Offset += C->getZExtValue(); 14207 Op = Op.getOperand(0); 14208 continue; 14209 } 14210 } else if (Op.getOpcode() == ISD::SUB) { 14211 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 14212 Offset += -C->getZExtValue(); 14213 Op = Op.getOperand(0); 14214 continue; 14215 } 14216 } 14217 14218 // Otherwise, this isn't something we can handle, reject it. 
14219 return; 14220 } 14221 14222 const GlobalValue *GV = GA->getGlobal(); 14223 // If we require an extra load to get this address, as in PIC mode, we 14224 // can't accept it. 14225 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 14226 getTargetMachine()))) 14227 return; 14228 14229 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 14230 GA->getValueType(0), Offset); 14231 break; 14232 } 14233 } 14234 14235 if (Result.getNode()) { 14236 Ops.push_back(Result); 14237 return; 14238 } 14239 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 14240} 14241 14242std::pair<unsigned, const TargetRegisterClass*> 14243X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 14244 EVT VT) const { 14245 // First, see if this is a constraint that directly corresponds to an LLVM 14246 // register class. 14247 if (Constraint.size() == 1) { 14248 // GCC Constraint Letters 14249 switch (Constraint[0]) { 14250 default: break; 14251 // TODO: Slight differences here in allocation order and leaving 14252 // RIP in the class. Do they matter any more here than they do 14253 // in the normal allocation? 14254 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 14255 if (Subtarget->is64Bit()) { 14256 if (VT == MVT::i32 || VT == MVT::f32) 14257 return std::make_pair(0U, X86::GR32RegisterClass); 14258 else if (VT == MVT::i16) 14259 return std::make_pair(0U, X86::GR16RegisterClass); 14260 else if (VT == MVT::i8 || VT == MVT::i1) 14261 return std::make_pair(0U, X86::GR8RegisterClass); 14262 else if (VT == MVT::i64 || VT == MVT::f64) 14263 return std::make_pair(0U, X86::GR64RegisterClass); 14264 break; 14265 } 14266 // 32-bit fallthrough 14267 case 'Q': // Q_REGS 14268 if (VT == MVT::i32 || VT == MVT::f32) 14269 return std::make_pair(0U, X86::GR32_ABCDRegisterClass); 14270 else if (VT == MVT::i16) 14271 return std::make_pair(0U, X86::GR16_ABCDRegisterClass); 14272 else if (VT == MVT::i8 || VT == MVT::i1) 14273 return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass); 14274 else if (VT == MVT::i64) 14275 return std::make_pair(0U, X86::GR64_ABCDRegisterClass); 14276 break; 14277 case 'r': // GENERAL_REGS 14278 case 'l': // INDEX_REGS 14279 if (VT == MVT::i8 || VT == MVT::i1) 14280 return std::make_pair(0U, X86::GR8RegisterClass); 14281 if (VT == MVT::i16) 14282 return std::make_pair(0U, X86::GR16RegisterClass); 14283 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) 14284 return std::make_pair(0U, X86::GR32RegisterClass); 14285 return std::make_pair(0U, X86::GR64RegisterClass); 14286 case 'R': // LEGACY_REGS 14287 if (VT == MVT::i8 || VT == MVT::i1) 14288 return std::make_pair(0U, X86::GR8_NOREXRegisterClass); 14289 if (VT == MVT::i16) 14290 return std::make_pair(0U, X86::GR16_NOREXRegisterClass); 14291 if (VT == MVT::i32 || !Subtarget->is64Bit()) 14292 return std::make_pair(0U, X86::GR32_NOREXRegisterClass); 14293 return std::make_pair(0U, X86::GR64_NOREXRegisterClass); 14294 case 'f': // FP Stack registers. 14295 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 14296 // value to the correct fpstack register class. 14297 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 14298 return std::make_pair(0U, X86::RFP32RegisterClass); 14299 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 14300 return std::make_pair(0U, X86::RFP64RegisterClass); 14301 return std::make_pair(0U, X86::RFP80RegisterClass); 14302 case 'y': // MMX_REGS if MMX allowed. 
14303 if (!Subtarget->hasMMX()) break; 14304 return std::make_pair(0U, X86::VR64RegisterClass); 14305 case 'Y': // SSE_REGS if SSE2 allowed 14306 if (!Subtarget->hasXMMInt()) break; 14307 // FALL THROUGH. 14308 case 'x': // SSE_REGS if SSE1 allowed 14309 if (!Subtarget->hasXMM()) break; 14310 14311 switch (VT.getSimpleVT().SimpleTy) { 14312 default: break; 14313 // Scalar SSE types. 14314 case MVT::f32: 14315 case MVT::i32: 14316 return std::make_pair(0U, X86::FR32RegisterClass); 14317 case MVT::f64: 14318 case MVT::i64: 14319 return std::make_pair(0U, X86::FR64RegisterClass); 14320 // Vector types. 14321 case MVT::v16i8: 14322 case MVT::v8i16: 14323 case MVT::v4i32: 14324 case MVT::v2i64: 14325 case MVT::v4f32: 14326 case MVT::v2f64: 14327 return std::make_pair(0U, X86::VR128RegisterClass); 14328 } 14329 break; 14330 } 14331 } 14332 14333 // Use the default implementation in TargetLowering to convert the register 14334 // constraint into a member of a register class. 14335 std::pair<unsigned, const TargetRegisterClass*> Res; 14336 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 14337 14338 // Not found as a standard register? 14339 if (Res.second == 0) { 14340 // Map st(0) -> st(7) -> ST0 14341 if (Constraint.size() == 7 && Constraint[0] == '{' && 14342 tolower(Constraint[1]) == 's' && 14343 tolower(Constraint[2]) == 't' && 14344 Constraint[3] == '(' && 14345 (Constraint[4] >= '0' && Constraint[4] <= '7') && 14346 Constraint[5] == ')' && 14347 Constraint[6] == '}') { 14348 14349 Res.first = X86::ST0+Constraint[4]-'0'; 14350 Res.second = X86::RFP80RegisterClass; 14351 return Res; 14352 } 14353 14354 // GCC allows "st(0)" to be called just plain "st". 14355 if (StringRef("{st}").equals_lower(Constraint)) { 14356 Res.first = X86::ST0; 14357 Res.second = X86::RFP80RegisterClass; 14358 return Res; 14359 } 14360 14361 // flags -> EFLAGS 14362 if (StringRef("{flags}").equals_lower(Constraint)) { 14363 Res.first = X86::EFLAGS; 14364 Res.second = X86::CCRRegisterClass; 14365 return Res; 14366 } 14367 14368 // 'A' means EAX + EDX. 14369 if (Constraint == "A") { 14370 Res.first = X86::EAX; 14371 Res.second = X86::GR32_ADRegisterClass; 14372 return Res; 14373 } 14374 return Res; 14375 } 14376 14377 // Otherwise, check to see if this is a register class of the wrong value 14378 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to 14379 // turn into {ax},{dx}. 14380 if (Res.second->hasType(VT)) 14381 return Res; // Correct type already, nothing to do. 14382 14383 // All of the single-register GCC register classes map their values onto 14384 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we 14385 // really want an 8-bit or 32-bit register, map to the appropriate register 14386 // class and return the appropriate register. 
14387 if (Res.second == X86::GR16RegisterClass) { 14388 if (VT == MVT::i8) { 14389 unsigned DestReg = 0; 14390 switch (Res.first) { 14391 default: break; 14392 case X86::AX: DestReg = X86::AL; break; 14393 case X86::DX: DestReg = X86::DL; break; 14394 case X86::CX: DestReg = X86::CL; break; 14395 case X86::BX: DestReg = X86::BL; break; 14396 } 14397 if (DestReg) { 14398 Res.first = DestReg; 14399 Res.second = X86::GR8RegisterClass; 14400 } 14401 } else if (VT == MVT::i32) { 14402 unsigned DestReg = 0; 14403 switch (Res.first) { 14404 default: break; 14405 case X86::AX: DestReg = X86::EAX; break; 14406 case X86::DX: DestReg = X86::EDX; break; 14407 case X86::CX: DestReg = X86::ECX; break; 14408 case X86::BX: DestReg = X86::EBX; break; 14409 case X86::SI: DestReg = X86::ESI; break; 14410 case X86::DI: DestReg = X86::EDI; break; 14411 case X86::BP: DestReg = X86::EBP; break; 14412 case X86::SP: DestReg = X86::ESP; break; 14413 } 14414 if (DestReg) { 14415 Res.first = DestReg; 14416 Res.second = X86::GR32RegisterClass; 14417 } 14418 } else if (VT == MVT::i64) { 14419 unsigned DestReg = 0; 14420 switch (Res.first) { 14421 default: break; 14422 case X86::AX: DestReg = X86::RAX; break; 14423 case X86::DX: DestReg = X86::RDX; break; 14424 case X86::CX: DestReg = X86::RCX; break; 14425 case X86::BX: DestReg = X86::RBX; break; 14426 case X86::SI: DestReg = X86::RSI; break; 14427 case X86::DI: DestReg = X86::RDI; break; 14428 case X86::BP: DestReg = X86::RBP; break; 14429 case X86::SP: DestReg = X86::RSP; break; 14430 } 14431 if (DestReg) { 14432 Res.first = DestReg; 14433 Res.second = X86::GR64RegisterClass; 14434 } 14435 } 14436 } else if (Res.second == X86::FR32RegisterClass || 14437 Res.second == X86::FR64RegisterClass || 14438 Res.second == X86::VR128RegisterClass) { 14439 // Handle references to XMM physical registers that got mapped into the 14440 // wrong class. This can happen with constraints like {xmm0} where the 14441 // target independent register mapper will just pick the first match it can 14442 // find, ignoring the required type. 14443 if (VT == MVT::f32) 14444 Res.second = X86::FR32RegisterClass; 14445 else if (VT == MVT::f64) 14446 Res.second = X86::FR64RegisterClass; 14447 else if (X86::VR128RegisterClass->hasType(VT)) 14448 Res.second = X86::VR128RegisterClass; 14449 } 14450 14451 return Res; 14452} 14453
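// Worked examples for the inline-asm support above (illustrative sketches,
// not part of the original source):
//  - ExpandInlineAsm: an inline asm call whose asm string is a lone
//    "bswap $0" with constraints "=r,0" on an i32 value is recognized and
//    rewritten to the llvm.bswap.i32 intrinsic, so no inline-asm node is
//    emitted for it at all.
//  - getRegForInlineAsmConstraint: the constraint "{ax}" is first resolved by
//    the generic mapper to AX in a 16-bit register class; for an i32 operand
//    the remapping code above then returns EAX in GR32, and for an i64
//    operand RAX in GR64.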