X86ISelLowering.cpp revision e321d7ffc5b06152efc4d6e8e1880e08f5c936a8
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "Utils/X86ShuffleDecode.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static SDValue Insert128BitVector(SDValue Result,
                                  SDValue Vec,
                                  SDValue Idx,
                                  SelectionDAG &DAG,
                                  DebugLoc dl);

static SDValue Extract128BitVector(SDValue Vec,
                                   SDValue Idx,
                                   SelectionDAG &DAG,
                                   DebugLoc dl);

/// Generate a DAG to grab 128 bits from a vector > 128 bits.  This
/// sets things up to match to an AVX VEXTRACTF128 instruction or a
/// simple subregister reference.  Idx is an index in the 128 bits we
/// want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec,
                                   SDValue Idx,
                                   SelectionDAG &DAG,
                                   DebugLoc dl) {
  EVT VT = Vec.getValueType();
  assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
  EVT ElVT = VT.getVectorElementType();
  int Factor = VT.getSizeInBits()/128;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract from UNDEF is UNDEF.
  if (Vec.getOpcode() == ISD::UNDEF)
    return DAG.getNode(ISD::UNDEF, dl, ResultVT);

  if (isa<ConstantSDNode>(Idx)) {
    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

    // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
    // we can match to VEXTRACTF128.
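    // Worked example of the index math below: extracting element 5 of a
    // v8i32 gives ElVT = i32, so ElemsPerChunk = 128/32 = 4 and
    // NormalizedIdxVal = ((5 * 32) / 128) * 4 = 4.  The EXTRACT_SUBVECTOR
    // therefore starts at element 4, the first element of the upper
    // 128-bit half, and returns it as a v4i32.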
    unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();

    // This is the index of the first element of the 128-bit chunk
    // we want.
    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
                                 * ElemsPerChunk);

    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
    SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                                 VecIdx);

    return Result;
  }

  return SDValue();
}

/// Generate a DAG to put 128 bits into a vector > 128 bits.  This
/// sets things up to match to an AVX VINSERTF128 instruction or a
/// simple superregister reference.  Idx is an index in the 128 bits
/// we want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result,
                                  SDValue Vec,
                                  SDValue Idx,
                                  SelectionDAG &DAG,
                                  DebugLoc dl) {
  if (isa<ConstantSDNode>(Idx)) {
    EVT VT = Vec.getValueType();
    assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");

    EVT ElVT = VT.getVectorElementType();
    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
    EVT ResultVT = Result.getValueType();

    // Insert the relevant 128 bits.
    unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();

    // This is the index of the first element of the 128-bit chunk
    // we want.
    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
                                 * ElemsPerChunk);

    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
    Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                         VecIdx);
    return Result;
  }

  return SDValue();
}

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
  bool is64Bit = Subtarget->is64Bit();

  if (Subtarget->isTargetEnvMacho()) {
    if (is64Bit)
      return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  }

  if (Subtarget->isTargetELF())
    return new TargetLoweringObjectFileELF();
  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    return new TargetLoweringObjectFileCOFF();
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasXMMInt() || Subtarget->hasAVX();
  X86ScalarSSEf32 = Subtarget->hasXMM() || Subtarget->hasAVX();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.
  static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird; it always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler; for
  // 32-bit code use the register pressure specific scheduling.
  if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    // Setup Windows compiler runtime calls.
194 setLibcallName(RTLIB::SDIV_I64, "_alldiv"); 195 setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); 196 setLibcallName(RTLIB::SREM_I64, "_allrem"); 197 setLibcallName(RTLIB::UREM_I64, "_aullrem"); 198 setLibcallName(RTLIB::MUL_I64, "_allmul"); 199 setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2"); 200 setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2"); 201 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); 202 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); 203 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); 204 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); 205 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); 206 setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C); 207 setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C); 208 } 209 210 if (Subtarget->isTargetDarwin()) { 211 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. 212 setUseUnderscoreSetJmp(false); 213 setUseUnderscoreLongJmp(false); 214 } else if (Subtarget->isTargetMingw()) { 215 // MS runtime is weird: it exports _setjmp, but longjmp! 216 setUseUnderscoreSetJmp(true); 217 setUseUnderscoreLongJmp(false); 218 } else { 219 setUseUnderscoreSetJmp(true); 220 setUseUnderscoreLongJmp(true); 221 } 222 223 // Set up the register classes. 224 addRegisterClass(MVT::i8, X86::GR8RegisterClass); 225 addRegisterClass(MVT::i16, X86::GR16RegisterClass); 226 addRegisterClass(MVT::i32, X86::GR32RegisterClass); 227 if (Subtarget->is64Bit()) 228 addRegisterClass(MVT::i64, X86::GR64RegisterClass); 229 230 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 231 232 // We don't accept any truncstore of integer registers. 233 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 234 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 235 setTruncStoreAction(MVT::i64, MVT::i8 , Expand); 236 setTruncStoreAction(MVT::i32, MVT::i16, Expand); 237 setTruncStoreAction(MVT::i32, MVT::i8 , Expand); 238 setTruncStoreAction(MVT::i16, MVT::i8, Expand); 239 240 // SETOEQ and SETUNE require checking two conditions. 241 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); 242 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); 243 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); 244 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); 245 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); 246 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); 247 248 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this 249 // operation. 250 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); 251 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); 252 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); 253 254 if (Subtarget->is64Bit()) { 255 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); 256 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); 257 } else if (!UseSoftFloat) { 258 // We have an algorithm for SSE2->double, and we turn this into a 259 // 64-bit FILD followed by conditional FADD for other targets. 260 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 261 // We have an algorithm for SSE2, and we turn this into a 64-bit 262 // FILD for other targets. 263 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); 264 } 265 266 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have 267 // this operation. 
268 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); 269 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); 270 271 if (!UseSoftFloat) { 272 // SSE has no i16 to fp conversion, only i32 273 if (X86ScalarSSEf32) { 274 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 275 // f32 and f64 cases are Legal, f80 case is not 276 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 277 } else { 278 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); 279 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 280 } 281 } else { 282 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 283 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); 284 } 285 286 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 287 // are Legal, f80 is custom lowered. 288 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); 289 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); 290 291 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have 292 // this operation. 293 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); 294 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); 295 296 if (X86ScalarSSEf32) { 297 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); 298 // f32 and f64 cases are Legal, f80 case is not 299 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 300 } else { 301 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); 302 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 303 } 304 305 // Handle FP_TO_UINT by promoting the destination to a larger signed 306 // conversion. 307 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); 308 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); 309 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); 310 311 if (Subtarget->is64Bit()) { 312 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); 313 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); 314 } else if (!UseSoftFloat) { 315 if (X86ScalarSSEf32 && !Subtarget->hasSSE3()) 316 // Expand FP_TO_UINT into a select. 317 // FIXME: We would like to use a Custom expander here eventually to do 318 // the optimal thing for SSE vs. the default expansion in the legalizer. 319 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); 320 else 321 // With SSE3 we can use fisttpll to convert to a signed i64; without 322 // SSE, we're stuck with a fistpll. 323 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); 324 } 325 326 // TODO: when we have SSE, these could be more efficient, by using movd/movq. 327 if (!X86ScalarSSEf64) { 328 setOperationAction(ISD::BITCAST , MVT::f32 , Expand); 329 setOperationAction(ISD::BITCAST , MVT::i32 , Expand); 330 if (Subtarget->is64Bit()) { 331 setOperationAction(ISD::BITCAST , MVT::f64 , Expand); 332 // Without SSE, i64->f64 goes through memory. 333 setOperationAction(ISD::BITCAST , MVT::i64 , Expand); 334 } 335 } 336 337 // Scalar integer divide and remainder are lowered to use operations that 338 // produce two results, to match the available instructions. This exposes 339 // the two-result form to trivial CSE, which is able to combine x/y and x%y 340 // into a single instruction. 341 // 342 // Scalar integer multiply-high is also lowered to use two-result 343 // operations, to match the available instructions. However, plain multiply 344 // (low) operations are left as Legal, as there are single-result 345 // instructions for this in x86. 
Using the two-result multiply instructions 346 // when both high and low results are needed must be arranged by dagcombine. 347 for (unsigned i = 0, e = 4; i != e; ++i) { 348 MVT VT = IntVTs[i]; 349 setOperationAction(ISD::MULHS, VT, Expand); 350 setOperationAction(ISD::MULHU, VT, Expand); 351 setOperationAction(ISD::SDIV, VT, Expand); 352 setOperationAction(ISD::UDIV, VT, Expand); 353 setOperationAction(ISD::SREM, VT, Expand); 354 setOperationAction(ISD::UREM, VT, Expand); 355 356 // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. 357 setOperationAction(ISD::ADDC, VT, Custom); 358 setOperationAction(ISD::ADDE, VT, Custom); 359 setOperationAction(ISD::SUBC, VT, Custom); 360 setOperationAction(ISD::SUBE, VT, Custom); 361 } 362 363 setOperationAction(ISD::BR_JT , MVT::Other, Expand); 364 setOperationAction(ISD::BRCOND , MVT::Other, Custom); 365 setOperationAction(ISD::BR_CC , MVT::Other, Expand); 366 setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); 367 if (Subtarget->is64Bit()) 368 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 369 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); 370 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 371 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); 372 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); 373 setOperationAction(ISD::FREM , MVT::f32 , Expand); 374 setOperationAction(ISD::FREM , MVT::f64 , Expand); 375 setOperationAction(ISD::FREM , MVT::f80 , Expand); 376 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); 377 378 setOperationAction(ISD::CTTZ , MVT::i8 , Custom); 379 setOperationAction(ISD::CTLZ , MVT::i8 , Custom); 380 setOperationAction(ISD::CTTZ , MVT::i16 , Custom); 381 setOperationAction(ISD::CTLZ , MVT::i16 , Custom); 382 setOperationAction(ISD::CTTZ , MVT::i32 , Custom); 383 setOperationAction(ISD::CTLZ , MVT::i32 , Custom); 384 if (Subtarget->is64Bit()) { 385 setOperationAction(ISD::CTTZ , MVT::i64 , Custom); 386 setOperationAction(ISD::CTLZ , MVT::i64 , Custom); 387 } 388 389 if (Subtarget->hasPOPCNT()) { 390 setOperationAction(ISD::CTPOP , MVT::i8 , Promote); 391 } else { 392 setOperationAction(ISD::CTPOP , MVT::i8 , Expand); 393 setOperationAction(ISD::CTPOP , MVT::i16 , Expand); 394 setOperationAction(ISD::CTPOP , MVT::i32 , Expand); 395 if (Subtarget->is64Bit()) 396 setOperationAction(ISD::CTPOP , MVT::i64 , Expand); 397 } 398 399 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); 400 setOperationAction(ISD::BSWAP , MVT::i16 , Expand); 401 402 // These should be promoted to a larger select which is supported. 403 setOperationAction(ISD::SELECT , MVT::i1 , Promote); 404 // X86 wants to expand cmov itself. 
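  // (Custom lowering of these SELECTs builds X86ISD::CMOV nodes rather than
  // leaving the selects to the generic expander.)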
  setOperationAction(ISD::SELECT , MVT::i8 , Custom);
  setOperationAction(ISD::SELECT , MVT::i16 , Custom);
  setOperationAction(ISD::SELECT , MVT::i32 , Custom);
  setOperationAction(ISD::SELECT , MVT::f32 , Custom);
  setOperationAction(ISD::SELECT , MVT::f64 , Custom);
  setOperationAction(ISD::SELECT , MVT::f80 , Custom);
  setOperationAction(ISD::SETCC , MVT::i8 , Custom);
  setOperationAction(ISD::SETCC , MVT::i16 , Custom);
  setOperationAction(ISD::SETCC , MVT::i32 , Custom);
  setOperationAction(ISD::SETCC , MVT::f32 , Custom);
  setOperationAction(ISD::SETCC , MVT::f64 , Custom);
  setOperationAction(ISD::SETCC , MVT::f80 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT , MVT::i64 , Custom);
    setOperationAction(ISD::SETCC , MVT::i64 , Custom);
  }
  setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
  setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
  setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
  setOperationAction(ISD::BlockAddress , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
    setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
    setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
    setOperationAction(ISD::BlockAddress , MVT::i64 , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
  setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
  setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
    setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
    setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
  }

  if (Subtarget->hasXMM())
    setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

  setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom);
  setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
460 setShouldFoldAtomicFences(true); 461 462 // Expand certain atomics 463 for (unsigned i = 0, e = 4; i != e; ++i) { 464 MVT VT = IntVTs[i]; 465 setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); 466 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); 467 } 468 469 if (!Subtarget->is64Bit()) { 470 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); 471 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 472 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 473 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); 474 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); 475 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); 476 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); 477 } 478 479 // FIXME - use subtarget debug flags 480 if (!Subtarget->isTargetDarwin() && 481 !Subtarget->isTargetELF() && 482 !Subtarget->isTargetCygMing()) { 483 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); 484 } 485 486 setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); 487 setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); 488 setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); 489 setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); 490 if (Subtarget->is64Bit()) { 491 setExceptionPointerRegister(X86::RAX); 492 setExceptionSelectorRegister(X86::RDX); 493 } else { 494 setExceptionPointerRegister(X86::EAX); 495 setExceptionSelectorRegister(X86::EDX); 496 } 497 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); 498 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); 499 500 setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom); 501 502 setOperationAction(ISD::TRAP, MVT::Other, Legal); 503 504 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 505 setOperationAction(ISD::VASTART , MVT::Other, Custom); 506 setOperationAction(ISD::VAEND , MVT::Other, Expand); 507 if (Subtarget->is64Bit()) { 508 setOperationAction(ISD::VAARG , MVT::Other, Custom); 509 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 510 } else { 511 setOperationAction(ISD::VAARG , MVT::Other, Expand); 512 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 513 } 514 515 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 516 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 517 setOperationAction(ISD::DYNAMIC_STACKALLOC, 518 (Subtarget->is64Bit() ? MVT::i64 : MVT::i32), 519 (Subtarget->isTargetCOFF() 520 && !Subtarget->isTargetEnvMacho() 521 ? Custom : Expand)); 522 523 if (!UseSoftFloat && X86ScalarSSEf64) { 524 // f32 and f64 use SSE. 525 // Set up the FP register classes. 526 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 527 addRegisterClass(MVT::f64, X86::FR64RegisterClass); 528 529 // Use ANDPD to simulate FABS. 530 setOperationAction(ISD::FABS , MVT::f64, Custom); 531 setOperationAction(ISD::FABS , MVT::f32, Custom); 532 533 // Use XORP to simulate FNEG. 534 setOperationAction(ISD::FNEG , MVT::f64, Custom); 535 setOperationAction(ISD::FNEG , MVT::f32, Custom); 536 537 // Use ANDPD and ORPD to simulate FCOPYSIGN. 538 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 539 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 540 541 // Lower this to FGETSIGNx86 plus an AND. 
542 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); 543 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); 544 545 // We don't support sin/cos/fmod 546 setOperationAction(ISD::FSIN , MVT::f64, Expand); 547 setOperationAction(ISD::FCOS , MVT::f64, Expand); 548 setOperationAction(ISD::FSIN , MVT::f32, Expand); 549 setOperationAction(ISD::FCOS , MVT::f32, Expand); 550 551 // Expand FP immediates into loads from the stack, except for the special 552 // cases we handle. 553 addLegalFPImmediate(APFloat(+0.0)); // xorpd 554 addLegalFPImmediate(APFloat(+0.0f)); // xorps 555 } else if (!UseSoftFloat && X86ScalarSSEf32) { 556 // Use SSE for f32, x87 for f64. 557 // Set up the FP register classes. 558 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 559 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 560 561 // Use ANDPS to simulate FABS. 562 setOperationAction(ISD::FABS , MVT::f32, Custom); 563 564 // Use XORP to simulate FNEG. 565 setOperationAction(ISD::FNEG , MVT::f32, Custom); 566 567 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 568 569 // Use ANDPS and ORPS to simulate FCOPYSIGN. 570 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 571 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 572 573 // We don't support sin/cos/fmod 574 setOperationAction(ISD::FSIN , MVT::f32, Expand); 575 setOperationAction(ISD::FCOS , MVT::f32, Expand); 576 577 // Special cases we handle for FP constants. 578 addLegalFPImmediate(APFloat(+0.0f)); // xorps 579 addLegalFPImmediate(APFloat(+0.0)); // FLD0 580 addLegalFPImmediate(APFloat(+1.0)); // FLD1 581 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 582 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 583 584 if (!UnsafeFPMath) { 585 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 586 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 587 } 588 } else if (!UseSoftFloat) { 589 // f32 and f64 in x87. 590 // Set up the FP register classes. 591 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 592 addRegisterClass(MVT::f32, X86::RFP32RegisterClass); 593 594 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 595 setOperationAction(ISD::UNDEF, MVT::f32, Expand); 596 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 597 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 598 599 if (!UnsafeFPMath) { 600 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 601 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 602 } 603 addLegalFPImmediate(APFloat(+0.0)); // FLD0 604 addLegalFPImmediate(APFloat(+1.0)); // FLD1 605 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 606 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 607 addLegalFPImmediate(APFloat(+0.0f)); // FLD0 608 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 609 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS 610 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS 611 } 612 613 // We don't support FMA. 614 setOperationAction(ISD::FMA, MVT::f64, Expand); 615 setOperationAction(ISD::FMA, MVT::f32, Expand); 616 617 // Long double always uses X87. 
618 if (!UseSoftFloat) { 619 addRegisterClass(MVT::f80, X86::RFP80RegisterClass); 620 setOperationAction(ISD::UNDEF, MVT::f80, Expand); 621 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); 622 { 623 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); 624 addLegalFPImmediate(TmpFlt); // FLD0 625 TmpFlt.changeSign(); 626 addLegalFPImmediate(TmpFlt); // FLD0/FCHS 627 628 bool ignored; 629 APFloat TmpFlt2(+1.0); 630 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 631 &ignored); 632 addLegalFPImmediate(TmpFlt2); // FLD1 633 TmpFlt2.changeSign(); 634 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS 635 } 636 637 if (!UnsafeFPMath) { 638 setOperationAction(ISD::FSIN , MVT::f80 , Expand); 639 setOperationAction(ISD::FCOS , MVT::f80 , Expand); 640 } 641 642 setOperationAction(ISD::FMA, MVT::f80, Expand); 643 } 644 645 // Always use a library call for pow. 646 setOperationAction(ISD::FPOW , MVT::f32 , Expand); 647 setOperationAction(ISD::FPOW , MVT::f64 , Expand); 648 setOperationAction(ISD::FPOW , MVT::f80 , Expand); 649 650 setOperationAction(ISD::FLOG, MVT::f80, Expand); 651 setOperationAction(ISD::FLOG2, MVT::f80, Expand); 652 setOperationAction(ISD::FLOG10, MVT::f80, Expand); 653 setOperationAction(ISD::FEXP, MVT::f80, Expand); 654 setOperationAction(ISD::FEXP2, MVT::f80, Expand); 655 656 // First set operation action for all vector types to either promote 657 // (for widening) or expand (for scalarization). Then we will selectively 658 // turn on ones that can be effectively codegen'd. 659 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 660 VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { 661 setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); 662 setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); 663 setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); 664 setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); 665 setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); 666 setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); 667 setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); 668 setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); 669 setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); 670 setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); 671 setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); 672 setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); 673 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); 674 setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); 675 setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); 676 setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); 677 setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 678 setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 679 setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); 680 setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); 681 setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); 682 setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); 683 setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); 684 setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); 685 setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); 686 setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 687 
setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 688 setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); 689 setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); 690 setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); 691 setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); 692 setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); 693 setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); 694 setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); 695 setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); 696 setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); 697 setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); 698 setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); 699 setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); 700 setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand); 701 setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); 702 setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); 703 setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); 704 setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); 705 setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); 706 setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); 707 setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); 708 setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 709 setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 710 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand); 711 setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand); 712 setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); 713 setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); 714 setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); 715 for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 716 InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) 717 setTruncStoreAction((MVT::SimpleValueType)VT, 718 (MVT::SimpleValueType)InnerVT, Expand); 719 setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); 720 setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); 721 setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); 722 } 723 724 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 725 // with -msoft-float, disable use of MMX as well. 726 if (!UseSoftFloat && Subtarget->hasMMX()) { 727 addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); 728 // No operations on x86mmx supported, everything uses intrinsics. 729 } 730 731 // MMX-sized vectors (other than x86mmx) are expected to be expanded 732 // into smaller operations. 
733 setOperationAction(ISD::MULHS, MVT::v8i8, Expand); 734 setOperationAction(ISD::MULHS, MVT::v4i16, Expand); 735 setOperationAction(ISD::MULHS, MVT::v2i32, Expand); 736 setOperationAction(ISD::MULHS, MVT::v1i64, Expand); 737 setOperationAction(ISD::AND, MVT::v8i8, Expand); 738 setOperationAction(ISD::AND, MVT::v4i16, Expand); 739 setOperationAction(ISD::AND, MVT::v2i32, Expand); 740 setOperationAction(ISD::AND, MVT::v1i64, Expand); 741 setOperationAction(ISD::OR, MVT::v8i8, Expand); 742 setOperationAction(ISD::OR, MVT::v4i16, Expand); 743 setOperationAction(ISD::OR, MVT::v2i32, Expand); 744 setOperationAction(ISD::OR, MVT::v1i64, Expand); 745 setOperationAction(ISD::XOR, MVT::v8i8, Expand); 746 setOperationAction(ISD::XOR, MVT::v4i16, Expand); 747 setOperationAction(ISD::XOR, MVT::v2i32, Expand); 748 setOperationAction(ISD::XOR, MVT::v1i64, Expand); 749 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand); 750 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand); 751 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand); 752 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand); 753 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); 754 setOperationAction(ISD::SELECT, MVT::v8i8, Expand); 755 setOperationAction(ISD::SELECT, MVT::v4i16, Expand); 756 setOperationAction(ISD::SELECT, MVT::v2i32, Expand); 757 setOperationAction(ISD::SELECT, MVT::v1i64, Expand); 758 setOperationAction(ISD::BITCAST, MVT::v8i8, Expand); 759 setOperationAction(ISD::BITCAST, MVT::v4i16, Expand); 760 setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); 761 setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); 762 763 if (!UseSoftFloat && Subtarget->hasXMM()) { 764 addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); 765 766 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 767 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 768 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 769 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 770 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 771 setOperationAction(ISD::FNEG, MVT::v4f32, Custom); 772 setOperationAction(ISD::LOAD, MVT::v4f32, Legal); 773 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 774 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 775 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 776 setOperationAction(ISD::SELECT, MVT::v4f32, Custom); 777 setOperationAction(ISD::VSETCC, MVT::v4f32, Custom); 778 } 779 780 if (!UseSoftFloat && Subtarget->hasXMMInt()) { 781 addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); 782 783 // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM 784 // registers cannot be used even for integer operations. 
785 addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); 786 addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); 787 addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); 788 addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); 789 790 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 791 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 792 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 793 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 794 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 795 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 796 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 797 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 798 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 799 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 800 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 801 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 802 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 803 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 804 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 805 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 806 807 setOperationAction(ISD::VSETCC, MVT::v2f64, Custom); 808 setOperationAction(ISD::VSETCC, MVT::v16i8, Custom); 809 setOperationAction(ISD::VSETCC, MVT::v8i16, Custom); 810 setOperationAction(ISD::VSETCC, MVT::v4i32, Custom); 811 812 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 813 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 814 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 815 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 816 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 817 818 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom); 819 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom); 820 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); 821 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); 822 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); 823 824 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 825 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { 826 EVT VT = (MVT::SimpleValueType)i; 827 // Do not attempt to custom lower non-power-of-2 vectors 828 if (!isPowerOf2_32(VT.getVectorNumElements())) 829 continue; 830 // Do not attempt to custom lower non-128-bit vectors 831 if (!VT.is128BitVector()) 832 continue; 833 setOperationAction(ISD::BUILD_VECTOR, 834 VT.getSimpleVT().SimpleTy, Custom); 835 setOperationAction(ISD::VECTOR_SHUFFLE, 836 VT.getSimpleVT().SimpleTy, Custom); 837 setOperationAction(ISD::EXTRACT_VECTOR_ELT, 838 VT.getSimpleVT().SimpleTy, Custom); 839 } 840 841 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 842 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 843 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 844 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 845 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 846 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 847 848 if (Subtarget->is64Bit()) { 849 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 850 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 851 } 852 853 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
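    // Promotion here works by bitcasting: each of these operations is
    // rewritten to operate on v2i64 (via AddPromotedToType below), with the
    // operands and result bitcast to and from the original type, so one set
    // of 128-bit patterns covers every integer element width.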
854 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) { 855 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 856 EVT VT = SVT; 857 858 // Do not attempt to promote non-128-bit vectors 859 if (!VT.is128BitVector()) 860 continue; 861 862 setOperationAction(ISD::AND, SVT, Promote); 863 AddPromotedToType (ISD::AND, SVT, MVT::v2i64); 864 setOperationAction(ISD::OR, SVT, Promote); 865 AddPromotedToType (ISD::OR, SVT, MVT::v2i64); 866 setOperationAction(ISD::XOR, SVT, Promote); 867 AddPromotedToType (ISD::XOR, SVT, MVT::v2i64); 868 setOperationAction(ISD::LOAD, SVT, Promote); 869 AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64); 870 setOperationAction(ISD::SELECT, SVT, Promote); 871 AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64); 872 } 873 874 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 875 876 // Custom lower v2i64 and v2f64 selects. 877 setOperationAction(ISD::LOAD, MVT::v2f64, Legal); 878 setOperationAction(ISD::LOAD, MVT::v2i64, Legal); 879 setOperationAction(ISD::SELECT, MVT::v2f64, Custom); 880 setOperationAction(ISD::SELECT, MVT::v2i64, Custom); 881 882 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); 883 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); 884 } 885 886 if (Subtarget->hasSSE41() || Subtarget->hasAVX()) { 887 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 888 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 889 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 890 setOperationAction(ISD::FRINT, MVT::f32, Legal); 891 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 892 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 893 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 894 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 895 setOperationAction(ISD::FRINT, MVT::f64, Legal); 896 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 897 898 // FIXME: Do we need to handle scalar-to-vector here? 899 setOperationAction(ISD::MUL, MVT::v4i32, Legal); 900 901 // Can turn SHL into an integer multiply. 902 setOperationAction(ISD::SHL, MVT::v4i32, Custom); 903 setOperationAction(ISD::SHL, MVT::v16i8, Custom); 904 905 // i8 and i16 vectors are custom , because the source register and source 906 // source memory operand types are not the same width. f32 vectors are 907 // custom since the immediate controlling the insert encodes additional 908 // information. 
909 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 910 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 911 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 912 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 913 914 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); 915 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); 916 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 917 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 918 919 if (Subtarget->is64Bit()) { 920 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); 921 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); 922 } 923 } 924 925 if (Subtarget->hasSSE2() || Subtarget->hasAVX()) { 926 setOperationAction(ISD::SRL, MVT::v2i64, Custom); 927 setOperationAction(ISD::SRL, MVT::v4i32, Custom); 928 setOperationAction(ISD::SRL, MVT::v16i8, Custom); 929 setOperationAction(ISD::SRL, MVT::v8i16, Custom); 930 931 setOperationAction(ISD::SHL, MVT::v2i64, Custom); 932 setOperationAction(ISD::SHL, MVT::v4i32, Custom); 933 setOperationAction(ISD::SHL, MVT::v8i16, Custom); 934 935 setOperationAction(ISD::SRA, MVT::v4i32, Custom); 936 setOperationAction(ISD::SRA, MVT::v8i16, Custom); 937 } 938 939 if (Subtarget->hasSSE42() || Subtarget->hasAVX()) 940 setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); 941 942 if (!UseSoftFloat && Subtarget->hasAVX()) { 943 addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); 944 addRegisterClass(MVT::v16i16, X86::VR256RegisterClass); 945 addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); 946 addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); 947 addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); 948 addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); 949 950 setOperationAction(ISD::LOAD, MVT::v8f32, Legal); 951 setOperationAction(ISD::LOAD, MVT::v4f64, Legal); 952 setOperationAction(ISD::LOAD, MVT::v4i64, Legal); 953 954 setOperationAction(ISD::FADD, MVT::v8f32, Legal); 955 setOperationAction(ISD::FSUB, MVT::v8f32, Legal); 956 setOperationAction(ISD::FMUL, MVT::v8f32, Legal); 957 setOperationAction(ISD::FDIV, MVT::v8f32, Legal); 958 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); 959 setOperationAction(ISD::FNEG, MVT::v8f32, Custom); 960 961 setOperationAction(ISD::FADD, MVT::v4f64, Legal); 962 setOperationAction(ISD::FSUB, MVT::v4f64, Legal); 963 setOperationAction(ISD::FMUL, MVT::v4f64, Legal); 964 setOperationAction(ISD::FDIV, MVT::v4f64, Legal); 965 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); 966 setOperationAction(ISD::FNEG, MVT::v4f64, Custom); 967 968 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); 969 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); 970 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); 971 972 // sint_to_fp between different vector types needs custom handling 973 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); 974 975 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f64, Custom); 976 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i64, Custom); 977 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); 978 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); 979 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom); 980 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom); 981 982 setOperationAction(ISD::SRL, MVT::v4i64, Custom); 983 setOperationAction(ISD::SRL, MVT::v8i32, Custom); 984 setOperationAction(ISD::SRL, MVT::v16i16, Custom); 985 setOperationAction(ISD::SRL, 
MVT::v32i8, Custom); 986 987 setOperationAction(ISD::SHL, MVT::v4i64, Custom); 988 setOperationAction(ISD::SHL, MVT::v8i32, Custom); 989 setOperationAction(ISD::SHL, MVT::v16i16, Custom); 990 setOperationAction(ISD::SHL, MVT::v32i8, Custom); 991 992 setOperationAction(ISD::SRA, MVT::v8i32, Custom); 993 setOperationAction(ISD::SRA, MVT::v16i16, Custom); 994 995 setOperationAction(ISD::VSETCC, MVT::v8i32, Custom); 996 setOperationAction(ISD::VSETCC, MVT::v4i64, Custom); 997 998 // Custom lower several nodes for 256-bit types. 999 for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 1000 i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { 1001 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 1002 EVT VT = SVT; 1003 1004 // Extract subvector is special because the value type 1005 // (result) is 128-bit but the source is 256-bit wide. 1006 if (VT.is128BitVector()) 1007 setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom); 1008 1009 // Do not attempt to custom lower other non-256-bit vectors 1010 if (!VT.is256BitVector()) 1011 continue; 1012 1013 setOperationAction(ISD::BUILD_VECTOR, SVT, Custom); 1014 setOperationAction(ISD::VECTOR_SHUFFLE, SVT, Custom); 1015 setOperationAction(ISD::INSERT_VECTOR_ELT, SVT, Custom); 1016 setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom); 1017 setOperationAction(ISD::SCALAR_TO_VECTOR, SVT, Custom); 1018 setOperationAction(ISD::INSERT_SUBVECTOR, SVT, Custom); 1019 } 1020 1021 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 1022 for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) { 1023 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 1024 EVT VT = SVT; 1025 1026 // Do not attempt to promote non-256-bit vectors 1027 if (!VT.is256BitVector()) 1028 continue; 1029 1030 setOperationAction(ISD::AND, SVT, Promote); 1031 AddPromotedToType (ISD::AND, SVT, MVT::v4i64); 1032 setOperationAction(ISD::OR, SVT, Promote); 1033 AddPromotedToType (ISD::OR, SVT, MVT::v4i64); 1034 setOperationAction(ISD::XOR, SVT, Promote); 1035 AddPromotedToType (ISD::XOR, SVT, MVT::v4i64); 1036 setOperationAction(ISD::LOAD, SVT, Promote); 1037 AddPromotedToType (ISD::LOAD, SVT, MVT::v4i64); 1038 setOperationAction(ISD::SELECT, SVT, Promote); 1039 AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64); 1040 } 1041 } 1042 1043 // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion 1044 // of this type with custom code. 1045 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 1046 VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) { 1047 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Custom); 1048 } 1049 1050 // We want to custom lower some of our intrinsics. 1051 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1052 1053 1054 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't 1055 // handle type legalization for these operations here. 1056 // 1057 // FIXME: We really should do custom legalization for addition and 1058 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better 1059 // than generic legalization for 64-bit multiplication-with-overflow, though. 1060 for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { 1061 // Add/Sub/Mul with overflow operations are custom lowered. 
1062 MVT VT = IntVTs[i]; 1063 setOperationAction(ISD::SADDO, VT, Custom); 1064 setOperationAction(ISD::UADDO, VT, Custom); 1065 setOperationAction(ISD::SSUBO, VT, Custom); 1066 setOperationAction(ISD::USUBO, VT, Custom); 1067 setOperationAction(ISD::SMULO, VT, Custom); 1068 setOperationAction(ISD::UMULO, VT, Custom); 1069 } 1070 1071 // There are no 8-bit 3-address imul/mul instructions 1072 setOperationAction(ISD::SMULO, MVT::i8, Expand); 1073 setOperationAction(ISD::UMULO, MVT::i8, Expand); 1074 1075 if (!Subtarget->is64Bit()) { 1076 // These libcalls are not available in 32-bit. 1077 setLibcallName(RTLIB::SHL_I128, 0); 1078 setLibcallName(RTLIB::SRL_I128, 0); 1079 setLibcallName(RTLIB::SRA_I128, 0); 1080 } 1081 1082 // We have target-specific dag combine patterns for the following nodes: 1083 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 1084 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 1085 setTargetDAGCombine(ISD::BUILD_VECTOR); 1086 setTargetDAGCombine(ISD::SELECT); 1087 setTargetDAGCombine(ISD::SHL); 1088 setTargetDAGCombine(ISD::SRA); 1089 setTargetDAGCombine(ISD::SRL); 1090 setTargetDAGCombine(ISD::OR); 1091 setTargetDAGCombine(ISD::AND); 1092 setTargetDAGCombine(ISD::ADD); 1093 setTargetDAGCombine(ISD::SUB); 1094 setTargetDAGCombine(ISD::STORE); 1095 setTargetDAGCombine(ISD::ZERO_EXTEND); 1096 setTargetDAGCombine(ISD::SINT_TO_FP); 1097 if (Subtarget->is64Bit()) 1098 setTargetDAGCombine(ISD::MUL); 1099 1100 computeRegisterProperties(); 1101 1102 // On Darwin, -Os means optimize for size without hurting performance, 1103 // do not reduce the limit. 1104 maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores 1105 maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; 1106 maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores 1107 maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; 1108 maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores 1109 maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; 1110 setPrefLoopAlignment(16); 1111 benefitFromCodePlacementOpt = true; 1112 1113 setPrefFunctionAlignment(4); 1114} 1115 1116 1117MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const { 1118 return MVT::i8; 1119} 1120 1121 1122/// getMaxByValAlign - Helper for getByValTypeAlignment to determine 1123/// the desired ByVal argument alignment. 1124static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { 1125 if (MaxAlign == 16) 1126 return; 1127 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { 1128 if (VTy->getBitWidth() == 128) 1129 MaxAlign = 16; 1130 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 1131 unsigned EltAlign = 0; 1132 getMaxByValAlign(ATy->getElementType(), EltAlign); 1133 if (EltAlign > MaxAlign) 1134 MaxAlign = EltAlign; 1135 } else if (StructType *STy = dyn_cast<StructType>(Ty)) { 1136 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { 1137 unsigned EltAlign = 0; 1138 getMaxByValAlign(STy->getElementType(i), EltAlign); 1139 if (EltAlign > MaxAlign) 1140 MaxAlign = EltAlign; 1141 if (MaxAlign == 16) 1142 break; 1143 } 1144 } 1145 return; 1146} 1147 1148/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate 1149/// function arguments in the caller parameter area. For X86, aggregates 1150/// that contain SSE vectors are placed at 16-byte boundaries while the rest 1151/// are at 4-byte boundaries. 1152unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { 1153 if (Subtarget->is64Bit()) { 1154 // Max of 8 and alignment of type. 
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasXMM())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero that means it's safe; the destination
/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
/// means there isn't a need to check it against alignment requirement,
/// probably because the source does not need to be loaded. If
/// 'NonScalarIntSafe' is true, that means it's safe to return a
/// non-scalar-integer type, e.g. empty string source, constant, or loaded
/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
/// constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  const Function *F = MF.getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasXMMInt()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
1242SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 1243 SelectionDAG &DAG) const { 1244 if (!Subtarget->is64Bit()) 1245 // This doesn't have DebugLoc associated with it, but is not really the 1246 // same as a Register. 1247 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()); 1248 return Table; 1249} 1250 1251/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the 1252/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an 1253/// MCExpr. 1254const MCExpr *X86TargetLowering:: 1255getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 1256 MCContext &Ctx) const { 1257 // X86-64 uses RIP relative addressing based on the jump table label. 1258 if (Subtarget->isPICStyleRIPRel()) 1259 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 1260 1261 // Otherwise, the reference is relative to the PIC base. 1262 return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); 1263} 1264 1265// FIXME: Why this routine is here? Move to RegInfo! 1266std::pair<const TargetRegisterClass*, uint8_t> 1267X86TargetLowering::findRepresentativeClass(EVT VT) const{ 1268 const TargetRegisterClass *RRC = 0; 1269 uint8_t Cost = 1; 1270 switch (VT.getSimpleVT().SimpleTy) { 1271 default: 1272 return TargetLowering::findRepresentativeClass(VT); 1273 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 1274 RRC = (Subtarget->is64Bit() 1275 ? X86::GR64RegisterClass : X86::GR32RegisterClass); 1276 break; 1277 case MVT::x86mmx: 1278 RRC = X86::VR64RegisterClass; 1279 break; 1280 case MVT::f32: case MVT::f64: 1281 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1282 case MVT::v4f32: case MVT::v2f64: 1283 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1284 case MVT::v4f64: 1285 RRC = X86::VR128RegisterClass; 1286 break; 1287 } 1288 return std::make_pair(RRC, Cost); 1289} 1290 1291bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1292 unsigned &Offset) const { 1293 if (!Subtarget->isTargetLinux()) 1294 return false; 1295 1296 if (Subtarget->is64Bit()) { 1297 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1298 Offset = 0x28; 1299 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1300 AddressSpace = 256; 1301 else 1302 AddressSpace = 257; 1303 } else { 1304 // %gs:0x14 on i386 1305 Offset = 0x14; 1306 AddressSpace = 256; 1307 } 1308 return true; 1309} 1310 1311 1312//===----------------------------------------------------------------------===// 1313// Return Value Calling Convention Implementation 1314//===----------------------------------------------------------------------===// 1315 1316#include "X86GenCallingConv.inc" 1317 1318bool 1319X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, 1320 MachineFunction &MF, bool isVarArg, 1321 const SmallVectorImpl<ISD::OutputArg> &Outs, 1322 LLVMContext &Context) const { 1323 SmallVector<CCValAssign, 16> RVLocs; 1324 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1325 RVLocs, Context); 1326 return CCInfo.CheckReturn(Outs, RetCC_X86); 1327} 1328 1329SDValue 1330X86TargetLowering::LowerReturn(SDValue Chain, 1331 CallingConv::ID CallConv, bool isVarArg, 1332 const SmallVectorImpl<ISD::OutputArg> &Outs, 1333 const SmallVectorImpl<SDValue> &OutVals, 1334 DebugLoc dl, SelectionDAG &DAG) const { 1335 MachineFunction &MF = DAG.getMachineFunction(); 1336 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1337 1338 SmallVector<CCValAssign, 16> 
RVLocs; 1339 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1340 RVLocs, *DAG.getContext()); 1341 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1342 1343 // Add the regs to the liveout set for the function. 1344 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1345 for (unsigned i = 0; i != RVLocs.size(); ++i) 1346 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1347 MRI.addLiveOut(RVLocs[i].getLocReg()); 1348 1349 SDValue Flag; 1350 1351 SmallVector<SDValue, 6> RetOps; 1352 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1353 // Operand #1 = Bytes To Pop 1354 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1355 MVT::i16)); 1356 1357 // Copy the result values into the output registers. 1358 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1359 CCValAssign &VA = RVLocs[i]; 1360 assert(VA.isRegLoc() && "Can only return in registers!"); 1361 SDValue ValToCopy = OutVals[i]; 1362 EVT ValVT = ValToCopy.getValueType(); 1363 1364 // If this is x86-64, and we disabled SSE, we can't return FP values, 1365 // or SSE or MMX vectors. 1366 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 1367 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 1368 (Subtarget->is64Bit() && !Subtarget->hasXMM())) { 1369 report_fatal_error("SSE register return with SSE disabled"); 1370 } 1371 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 1372 // llvm-gcc has never done it right and no one has noticed, so this 1373 // should be OK for now. 1374 if (ValVT == MVT::f64 && 1375 (Subtarget->is64Bit() && !Subtarget->hasXMMInt())) 1376 report_fatal_error("SSE2 register return with SSE2 disabled"); 1377 1378 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1379 // the RET instruction and handled by the FP Stackifier. 1380 if (VA.getLocReg() == X86::ST0 || 1381 VA.getLocReg() == X86::ST1) { 1382 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1383 // change the value to the FP stack register class. 1384 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1385 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1386 RetOps.push_back(ValToCopy); 1387 // Don't emit a copytoreg. 1388 continue; 1389 } 1390 1391 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1392 // which is returned in RAX / RDX. 1393 if (Subtarget->is64Bit()) { 1394 if (ValVT == MVT::x86mmx) { 1395 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1396 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 1397 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1398 ValToCopy); 1399 // If we don't have SSE2 available, convert to v4f32 so the generated 1400 // register is legal. 1401 if (!Subtarget->hasSSE2()) 1402 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 1403 } 1404 } 1405 } 1406 1407 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1408 Flag = Chain.getValue(1); 1409 } 1410 1411 // The x86-64 ABI for returning structs by value requires that we copy 1412 // the sret argument into %rax for the return. We saved the argument into 1413 // a virtual register in the entry block, so now we copy the value out 1414 // and into %rax. 
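// For example (illustrative IR), for a function such as
//   define void @f(%struct.S* sret %agg.result) { ... ret void }
// the x86-64 ABI requires the sret pointer to come back in %rax, which is
// what the copy below arranges.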
1415 if (Subtarget->is64Bit() && 1416 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1417 MachineFunction &MF = DAG.getMachineFunction(); 1418 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1419 unsigned Reg = FuncInfo->getSRetReturnReg(); 1420 assert(Reg && 1421 "SRetReturnReg should have been set in LowerFormalArguments()."); 1422 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1423 1424 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1425 Flag = Chain.getValue(1); 1426 1427 // RAX now acts like a return value. 1428 MRI.addLiveOut(X86::RAX); 1429 } 1430 1431 RetOps[0] = Chain; // Update chain. 1432 1433 // Add the flag if we have it. 1434 if (Flag.getNode()) 1435 RetOps.push_back(Flag); 1436 1437 return DAG.getNode(X86ISD::RET_FLAG, dl, 1438 MVT::Other, &RetOps[0], RetOps.size()); 1439} 1440 1441bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { 1442 if (N->getNumValues() != 1) 1443 return false; 1444 if (!N->hasNUsesOfValue(1, 0)) 1445 return false; 1446 1447 SDNode *Copy = *N->use_begin(); 1448 if (Copy->getOpcode() != ISD::CopyToReg && 1449 Copy->getOpcode() != ISD::FP_EXTEND) 1450 return false; 1451 1452 bool HasRet = false; 1453 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 1454 UI != UE; ++UI) { 1455 if (UI->getOpcode() != X86ISD::RET_FLAG) 1456 return false; 1457 HasRet = true; 1458 } 1459 1460 return HasRet; 1461} 1462 1463EVT 1464X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, 1465 ISD::NodeType ExtendKind) const { 1466 MVT ReturnMVT; 1467 // TODO: Is this also valid on 32-bit? 1468 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) 1469 ReturnMVT = MVT::i8; 1470 else 1471 ReturnMVT = MVT::i32; 1472 1473 EVT MinVT = getRegisterType(Context, ReturnMVT); 1474 return VT.bitsLT(MinVT) ? MinVT : VT; 1475} 1476 1477/// LowerCallResult - Lower the result values of a call into the 1478/// appropriate copies out of appropriate physical registers. 1479/// 1480SDValue 1481X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1482 CallingConv::ID CallConv, bool isVarArg, 1483 const SmallVectorImpl<ISD::InputArg> &Ins, 1484 DebugLoc dl, SelectionDAG &DAG, 1485 SmallVectorImpl<SDValue> &InVals) const { 1486 1487 // Assign locations to each value returned by this call. 1488 SmallVector<CCValAssign, 16> RVLocs; 1489 bool Is64Bit = Subtarget->is64Bit(); 1490 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 1491 getTargetMachine(), RVLocs, *DAG.getContext()); 1492 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1493 1494 // Copy all of the result registers out of their specified physreg. 1495 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1496 CCValAssign &VA = RVLocs[i]; 1497 EVT CopyVT = VA.getValVT(); 1498 1499 // If this is x86-64, and we disabled SSE, we can't return FP values 1500 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1501 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) { 1502 report_fatal_error("SSE register return with SSE disabled"); 1503 } 1504 1505 SDValue Val; 1506 1507 // If this is a call to a function that returns an fp value on the floating 1508 // point stack, we must guarantee the the value is popped from the stack, so 1509 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1510 // if the return value is not used. We use the FpPOP_RETVAL instruction 1511 // instead. 
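// For example (illustrative C), with
//   long double g(void);
//   void h(void) { g(); }   // result ignored
// the return value still arrives in ST(0) and must be popped to keep the x87
// register stack balanced; FpPOP_RETVAL makes that pop explicit even when the
// copy would otherwise be dead.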
1512 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1513 // If we prefer to use the value in xmm registers, copy it out as f80 and 1514 // use a truncate to move it from fp stack reg to xmm reg. 1515 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 1516 SDValue Ops[] = { Chain, InFlag }; 1517 Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT, 1518 MVT::Other, MVT::Glue, Ops, 2), 1); 1519 Val = Chain.getValue(0); 1520 1521 // Round the f80 to the right size, which also moves it to the appropriate 1522 // xmm register. 1523 if (CopyVT != VA.getValVT()) 1524 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1525 // This truncation won't change the value. 1526 DAG.getIntPtrConstant(1)); 1527 } else { 1528 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1529 CopyVT, InFlag).getValue(1); 1530 Val = Chain.getValue(0); 1531 } 1532 InFlag = Chain.getValue(2); 1533 InVals.push_back(Val); 1534 } 1535 1536 return Chain; 1537} 1538 1539 1540//===----------------------------------------------------------------------===// 1541// C & StdCall & Fast Calling Convention implementation 1542//===----------------------------------------------------------------------===// 1543// StdCall calling convention seems to be standard for many Windows' API 1544// routines and around. It differs from C calling convention just a little: 1545// callee should clean up the stack, not caller. Symbols should be also 1546// decorated in some fancy way :) It doesn't support any vector arguments. 1547// For info on fast calling convention see Fast Calling Convention (tail call) 1548// implementation LowerX86_32FastCCCallTo. 1549 1550/// CallIsStructReturn - Determines whether a call uses struct return 1551/// semantics. 1552static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1553 if (Outs.empty()) 1554 return false; 1555 1556 return Outs[0].Flags.isSRet(); 1557} 1558 1559/// ArgsAreStructReturn - Determines whether a function uses struct 1560/// return semantics. 1561static bool 1562ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1563 if (Ins.empty()) 1564 return false; 1565 1566 return Ins[0].Flags.isSRet(); 1567} 1568 1569/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1570/// by "Src" to address "Dst" with size and alignment information specified by 1571/// the specific parameter attribute. The copy will be passed as a byval 1572/// function parameter. 1573static SDValue 1574CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1575 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1576 DebugLoc dl) { 1577 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1578 1579 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1580 /*isVolatile*/false, /*AlwaysInline=*/true, 1581 MachinePointerInfo(), MachinePointerInfo()); 1582} 1583 1584/// IsTailCallConvention - Return true if the calling convention is one that 1585/// supports tail call optimization. 
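// For example (illustrative IR), under -tailcallopt a call such as
//   %r = tail call fastcc i32 @callee(i32 %x)
// made from a fastcc caller uses one of the conventions accepted here and is
// lowered as a guaranteed tail call.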
1586static bool IsTailCallConvention(CallingConv::ID CC) { 1587 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1588} 1589 1590bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 1591 if (!CI->isTailCall()) 1592 return false; 1593 1594 CallSite CS(CI); 1595 CallingConv::ID CalleeCC = CS.getCallingConv(); 1596 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) 1597 return false; 1598 1599 return true; 1600} 1601 1602/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1603/// a tailcall target by changing its ABI. 1604static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1605 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1606} 1607 1608SDValue 1609X86TargetLowering::LowerMemArgument(SDValue Chain, 1610 CallingConv::ID CallConv, 1611 const SmallVectorImpl<ISD::InputArg> &Ins, 1612 DebugLoc dl, SelectionDAG &DAG, 1613 const CCValAssign &VA, 1614 MachineFrameInfo *MFI, 1615 unsigned i) const { 1616 // Create the nodes corresponding to a load from this parameter slot. 1617 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1618 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1619 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1620 EVT ValVT; 1621 1622 // If value is passed by pointer we have address passed instead of the value 1623 // itself. 1624 if (VA.getLocInfo() == CCValAssign::Indirect) 1625 ValVT = VA.getLocVT(); 1626 else 1627 ValVT = VA.getValVT(); 1628 1629 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1630 // changed with more analysis. 1631 // In case of tail call optimization mark all arguments mutable. Since they 1632 // could be overwritten by lowering of arguments in case of a tail call. 1633 if (Flags.isByVal()) { 1634 unsigned Bytes = Flags.getByValSize(); 1635 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. 1636 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); 1637 return DAG.getFrameIndex(FI, getPointerTy()); 1638 } else { 1639 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1640 VA.getLocMemOffset(), isImmutable); 1641 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1642 return DAG.getLoad(ValVT, dl, Chain, FIN, 1643 MachinePointerInfo::getFixedStack(FI), 1644 false, false, 0); 1645 } 1646} 1647 1648SDValue 1649X86TargetLowering::LowerFormalArguments(SDValue Chain, 1650 CallingConv::ID CallConv, 1651 bool isVarArg, 1652 const SmallVectorImpl<ISD::InputArg> &Ins, 1653 DebugLoc dl, 1654 SelectionDAG &DAG, 1655 SmallVectorImpl<SDValue> &InVals) 1656 const { 1657 MachineFunction &MF = DAG.getMachineFunction(); 1658 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1659 1660 const Function* Fn = MF.getFunction(); 1661 if (Fn->hasExternalLinkage() && 1662 Subtarget->isTargetCygMing() && 1663 Fn->getName() == "main") 1664 FuncInfo->setForceFramePointer(true); 1665 1666 MachineFrameInfo *MFI = MF.getFrameInfo(); 1667 bool Is64Bit = Subtarget->is64Bit(); 1668 bool IsWin64 = Subtarget->isTargetWin64(); 1669 1670 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1671 "Var args not supported with calling convention fastcc or ghc"); 1672 1673 // Assign locations to all of the incoming arguments. 
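// For example (illustrative), for a SysV x86-64 signature such as
//   define i32 @f(i32 %a, double %b, i64 %c)
// CC_X86 would typically assign %a -> EDI, %b -> XMM0 and %c -> RSI as
// register locations, while arguments that do not fit in registers get memory
// locations carrying a stack offset instead.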
1674 SmallVector<CCValAssign, 16> ArgLocs; 1675 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1676 ArgLocs, *DAG.getContext()); 1677 1678 // Allocate shadow area for Win64 1679 if (IsWin64) { 1680 CCInfo.AllocateStack(32, 8); 1681 } 1682 1683 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 1684 1685 unsigned LastVal = ~0U; 1686 SDValue ArgValue; 1687 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1688 CCValAssign &VA = ArgLocs[i]; 1689 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1690 // places. 1691 assert(VA.getValNo() != LastVal && 1692 "Don't support value assigned to multiple locs yet"); 1693 LastVal = VA.getValNo(); 1694 1695 if (VA.isRegLoc()) { 1696 EVT RegVT = VA.getLocVT(); 1697 TargetRegisterClass *RC = NULL; 1698 if (RegVT == MVT::i32) 1699 RC = X86::GR32RegisterClass; 1700 else if (Is64Bit && RegVT == MVT::i64) 1701 RC = X86::GR64RegisterClass; 1702 else if (RegVT == MVT::f32) 1703 RC = X86::FR32RegisterClass; 1704 else if (RegVT == MVT::f64) 1705 RC = X86::FR64RegisterClass; 1706 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1707 RC = X86::VR256RegisterClass; 1708 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1709 RC = X86::VR128RegisterClass; 1710 else if (RegVT == MVT::x86mmx) 1711 RC = X86::VR64RegisterClass; 1712 else 1713 llvm_unreachable("Unknown argument type!"); 1714 1715 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1716 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1717 1718 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1719 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1720 // right size. 1721 if (VA.getLocInfo() == CCValAssign::SExt) 1722 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1723 DAG.getValueType(VA.getValVT())); 1724 else if (VA.getLocInfo() == CCValAssign::ZExt) 1725 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1726 DAG.getValueType(VA.getValVT())); 1727 else if (VA.getLocInfo() == CCValAssign::BCvt) 1728 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 1729 1730 if (VA.isExtInLoc()) { 1731 // Handle MMX values passed in XMM regs. 1732 if (RegVT.isVector()) { 1733 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), 1734 ArgValue); 1735 } else 1736 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1737 } 1738 } else { 1739 assert(VA.isMemLoc()); 1740 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1741 } 1742 1743 // If value is passed via pointer - do a load. 1744 if (VA.getLocInfo() == CCValAssign::Indirect) 1745 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 1746 MachinePointerInfo(), false, false, 0); 1747 1748 InVals.push_back(ArgValue); 1749 } 1750 1751 // The x86-64 ABI for returning structs by value requires that we copy 1752 // the sret argument into %rax for the return. Save the argument into 1753 // a virtual register so that we can access it from the return points. 
1754 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1755 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1756 unsigned Reg = FuncInfo->getSRetReturnReg(); 1757 if (!Reg) { 1758 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1759 FuncInfo->setSRetReturnReg(Reg); 1760 } 1761 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1762 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1763 } 1764 1765 unsigned StackSize = CCInfo.getNextStackOffset(); 1766 // Align stack specially for tail calls. 1767 if (FuncIsMadeTailCallSafe(CallConv)) 1768 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1769 1770 // If the function takes variable number of arguments, make a frame index for 1771 // the start of the first vararg value... for expansion of llvm.va_start. 1772 if (isVarArg) { 1773 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1774 CallConv != CallingConv::X86_ThisCall)) { 1775 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1776 } 1777 if (Is64Bit) { 1778 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1779 1780 // FIXME: We should really autogenerate these arrays 1781 static const unsigned GPR64ArgRegsWin64[] = { 1782 X86::RCX, X86::RDX, X86::R8, X86::R9 1783 }; 1784 static const unsigned GPR64ArgRegs64Bit[] = { 1785 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1786 }; 1787 static const unsigned XMMArgRegs64Bit[] = { 1788 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1789 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1790 }; 1791 const unsigned *GPR64ArgRegs; 1792 unsigned NumXMMRegs = 0; 1793 1794 if (IsWin64) { 1795 // The XMM registers which might contain var arg parameters are shadowed 1796 // in their paired GPR. So we only need to save the GPR to their home 1797 // slots. 1798 TotalNumIntRegs = 4; 1799 GPR64ArgRegs = GPR64ArgRegsWin64; 1800 } else { 1801 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1802 GPR64ArgRegs = GPR64ArgRegs64Bit; 1803 1804 NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs); 1805 } 1806 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1807 TotalNumIntRegs); 1808 1809 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1810 assert(!(NumXMMRegs && !Subtarget->hasXMM()) && 1811 "SSE register cannot be used when SSE is disabled!"); 1812 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1813 "SSE register cannot be used when SSE is disabled!"); 1814 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM()) 1815 // Kernel mode asks for SSE to be disabled, so don't push them 1816 // on the stack. 1817 TotalNumXMMRegs = 0; 1818 1819 if (IsWin64) { 1820 const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); 1821 // Get to the caller-allocated home save location. Add 8 to account 1822 // for the return address. 1823 int HomeOffset = TFI.getOffsetOfLocalArea() + 8; 1824 FuncInfo->setRegSaveFrameIndex( 1825 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 1826 // Fixup to set vararg frame on shadow area (4 x i64). 1827 if (NumIntRegs < 4) 1828 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 1829 } else { 1830 // For X86-64, if there are vararg parameters that are passed via 1831 // registers, then we must store them to their spots on the stack so they 1832 // may be loaded by deferencing the result of va_next. 
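// Illustrative layout of the register save area created below (SysV x86-64,
// TotalNumIntRegs == 6, TotalNumXMMRegs == 8):
//   bytes   0..47  : RDI, RSI, RDX, RCX, R8, R9  (gp_offset = NumIntRegs * 8)
//   bytes  48..175 : XMM0..XMM7, 16 bytes each   (fp_offset = 48 + NumXMMRegs * 16)
// va_arg later walks this area using the offsets recorded in FuncInfo.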
1833 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1834 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1835 FuncInfo->setRegSaveFrameIndex( 1836 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1837 false)); 1838 } 1839 1840 // Store the integer parameter registers. 1841 SmallVector<SDValue, 8> MemOps; 1842 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1843 getPointerTy()); 1844 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1845 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1846 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1847 DAG.getIntPtrConstant(Offset)); 1848 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1849 X86::GR64RegisterClass); 1850 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1851 SDValue Store = 1852 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1853 MachinePointerInfo::getFixedStack( 1854 FuncInfo->getRegSaveFrameIndex(), Offset), 1855 false, false, 0); 1856 MemOps.push_back(Store); 1857 Offset += 8; 1858 } 1859 1860 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1861 // Now store the XMM (fp + vector) parameter registers. 1862 SmallVector<SDValue, 11> SaveXMMOps; 1863 SaveXMMOps.push_back(Chain); 1864 1865 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1866 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1867 SaveXMMOps.push_back(ALVal); 1868 1869 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1870 FuncInfo->getRegSaveFrameIndex())); 1871 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1872 FuncInfo->getVarArgsFPOffset())); 1873 1874 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1875 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 1876 X86::VR128RegisterClass); 1877 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1878 SaveXMMOps.push_back(Val); 1879 } 1880 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1881 MVT::Other, 1882 &SaveXMMOps[0], SaveXMMOps.size())); 1883 } 1884 1885 if (!MemOps.empty()) 1886 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1887 &MemOps[0], MemOps.size()); 1888 } 1889 } 1890 1891 // Some CCs need callee pop. 1892 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) { 1893 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1894 } else { 1895 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1896 // If this is an sret function, the return should pop the hidden pointer. 1897 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1898 FuncInfo->setBytesToPopOnReturn(4); 1899 } 1900 1901 if (!Is64Bit) { 1902 // RegSaveFrameIndex is X86-64 only. 1903 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1904 if (CallConv == CallingConv::X86_FastCall || 1905 CallConv == CallingConv::X86_ThisCall) 1906 // fastcc functions can't have varargs. 
1907 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1908 } 1909 1910 return Chain; 1911} 1912 1913SDValue 1914X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1915 SDValue StackPtr, SDValue Arg, 1916 DebugLoc dl, SelectionDAG &DAG, 1917 const CCValAssign &VA, 1918 ISD::ArgFlagsTy Flags) const { 1919 unsigned LocMemOffset = VA.getLocMemOffset(); 1920 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1921 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1922 if (Flags.isByVal()) 1923 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1924 1925 return DAG.getStore(Chain, dl, Arg, PtrOff, 1926 MachinePointerInfo::getStack(LocMemOffset), 1927 false, false, 0); 1928} 1929 1930/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1931/// optimization is performed and it is required. 1932SDValue 1933X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1934 SDValue &OutRetAddr, SDValue Chain, 1935 bool IsTailCall, bool Is64Bit, 1936 int FPDiff, DebugLoc dl) const { 1937 // Adjust the Return address stack slot. 1938 EVT VT = getPointerTy(); 1939 OutRetAddr = getReturnAddressFrameIndex(DAG); 1940 1941 // Load the "old" Return address. 1942 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 1943 false, false, 0); 1944 return SDValue(OutRetAddr.getNode(), 1); 1945} 1946 1947/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call 1948/// optimization is performed and it is required (FPDiff!=0). 1949static SDValue 1950EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1951 SDValue Chain, SDValue RetAddrFrIdx, 1952 bool Is64Bit, int FPDiff, DebugLoc dl) { 1953 // Store the return address to the appropriate stack slot. 1954 if (!FPDiff) return Chain; 1955 // Calculate the new stack slot for the return address. 1956 int SlotSize = Is64Bit ? 8 : 4; 1957 int NewReturnAddrFI = 1958 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 1959 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1960 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1961 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1962 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 1963 false, false, 0); 1964 return Chain; 1965} 1966 1967SDValue 1968X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1969 CallingConv::ID CallConv, bool isVarArg, 1970 bool &isTailCall, 1971 const SmallVectorImpl<ISD::OutputArg> &Outs, 1972 const SmallVectorImpl<SDValue> &OutVals, 1973 const SmallVectorImpl<ISD::InputArg> &Ins, 1974 DebugLoc dl, SelectionDAG &DAG, 1975 SmallVectorImpl<SDValue> &InVals) const { 1976 MachineFunction &MF = DAG.getMachineFunction(); 1977 bool Is64Bit = Subtarget->is64Bit(); 1978 bool IsWin64 = Subtarget->isTargetWin64(); 1979 bool IsStructRet = CallIsStructReturn(Outs); 1980 bool IsSibcall = false; 1981 1982 if (isTailCall) { 1983 // Check if it's really possible to do a tail call. 1984 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1985 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1986 Outs, OutVals, Ins, DAG); 1987 1988 // Sibcalls are automatically detected tailcalls which do not require 1989 // ABI changes. 
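// For example (illustrative IR), even without -tailcallopt
//   define i32 @caller(i32 %x) {
//     %r = tail call i32 @callee(i32 %x)
//     ret i32 %r
//   }
// can be emitted as a plain "jmp callee" because the incoming argument area is
// reused unchanged; that is the sibcall case flagged here.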
1990 if (!GuaranteedTailCallOpt && isTailCall) 1991 IsSibcall = true; 1992 1993 if (isTailCall) 1994 ++NumTailCalls; 1995 } 1996 1997 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1998 "Var args not supported with calling convention fastcc or ghc"); 1999 2000 // Analyze operands of the call, assigning locations to each operand. 2001 SmallVector<CCValAssign, 16> ArgLocs; 2002 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 2003 ArgLocs, *DAG.getContext()); 2004 2005 // Allocate shadow area for Win64 2006 if (IsWin64) { 2007 CCInfo.AllocateStack(32, 8); 2008 } 2009 2010 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2011 2012 // Get a count of how many bytes are to be pushed on the stack. 2013 unsigned NumBytes = CCInfo.getNextStackOffset(); 2014 if (IsSibcall) 2015 // This is a sibcall. The memory operands are available in caller's 2016 // own caller's stack. 2017 NumBytes = 0; 2018 else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) 2019 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 2020 2021 int FPDiff = 0; 2022 if (isTailCall && !IsSibcall) { 2023 // Lower arguments at fp - stackoffset + fpdiff. 2024 unsigned NumBytesCallerPushed = 2025 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 2026 FPDiff = NumBytesCallerPushed - NumBytes; 2027 2028 // Set the delta of movement of the returnaddr stackslot. 2029 // But only set if delta is greater than previous delta. 2030 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 2031 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 2032 } 2033 2034 if (!IsSibcall) 2035 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 2036 2037 SDValue RetAddrFrIdx; 2038 // Load return address for tail calls. 2039 if (isTailCall && FPDiff) 2040 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 2041 Is64Bit, FPDiff, dl); 2042 2043 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 2044 SmallVector<SDValue, 8> MemOpChains; 2045 SDValue StackPtr; 2046 2047 // Walk the register/memloc assignments, inserting copies/loads. In the case 2048 // of tail call optimization arguments are handle later. 2049 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2050 CCValAssign &VA = ArgLocs[i]; 2051 EVT RegVT = VA.getLocVT(); 2052 SDValue Arg = OutVals[i]; 2053 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2054 bool isByVal = Flags.isByVal(); 2055 2056 // Promote the value if needed. 2057 switch (VA.getLocInfo()) { 2058 default: llvm_unreachable("Unknown loc info!"); 2059 case CCValAssign::Full: break; 2060 case CCValAssign::SExt: 2061 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 2062 break; 2063 case CCValAssign::ZExt: 2064 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 2065 break; 2066 case CCValAssign::AExt: 2067 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 2068 // Special case: passing MMX values in XMM registers. 2069 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); 2070 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 2071 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 2072 } else 2073 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 2074 break; 2075 case CCValAssign::BCvt: 2076 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg); 2077 break; 2078 case CCValAssign::Indirect: { 2079 // Store the argument. 
2080 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 2081 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 2082 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 2083 MachinePointerInfo::getFixedStack(FI), 2084 false, false, 0); 2085 Arg = SpillSlot; 2086 break; 2087 } 2088 } 2089 2090 if (VA.isRegLoc()) { 2091 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2092 if (isVarArg && IsWin64) { 2093 // Win64 ABI requires argument XMM reg to be copied to the corresponding 2094 // shadow reg if callee is a varargs function. 2095 unsigned ShadowReg = 0; 2096 switch (VA.getLocReg()) { 2097 case X86::XMM0: ShadowReg = X86::RCX; break; 2098 case X86::XMM1: ShadowReg = X86::RDX; break; 2099 case X86::XMM2: ShadowReg = X86::R8; break; 2100 case X86::XMM3: ShadowReg = X86::R9; break; 2101 } 2102 if (ShadowReg) 2103 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 2104 } 2105 } else if (!IsSibcall && (!isTailCall || isByVal)) { 2106 assert(VA.isMemLoc()); 2107 if (StackPtr.getNode() == 0) 2108 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 2109 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2110 dl, DAG, VA, Flags)); 2111 } 2112 } 2113 2114 if (!MemOpChains.empty()) 2115 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2116 &MemOpChains[0], MemOpChains.size()); 2117 2118 // Build a sequence of copy-to-reg nodes chained together with token chain 2119 // and flag operands which copy the outgoing args into registers. 2120 SDValue InFlag; 2121 // Tail call byval lowering might overwrite argument registers so in case of 2122 // tail call optimization the copies to registers are lowered later. 2123 if (!isTailCall) 2124 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2125 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2126 RegsToPass[i].second, InFlag); 2127 InFlag = Chain.getValue(1); 2128 } 2129 2130 if (Subtarget->isPICStyleGOT()) { 2131 // ELF / PIC requires GOT in the EBX register before function calls via PLT 2132 // GOT pointer. 2133 if (!isTailCall) { 2134 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 2135 DAG.getNode(X86ISD::GlobalBaseReg, 2136 DebugLoc(), getPointerTy()), 2137 InFlag); 2138 InFlag = Chain.getValue(1); 2139 } else { 2140 // If we are tail calling and generating PIC/GOT style code load the 2141 // address of the callee into ECX. The value in ecx is used as target of 2142 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2143 // for tail calls on PIC/GOT architectures. Normally we would just put the 2144 // address of GOT into ebx and then call target@PLT. But for tail calls 2145 // ebx would be restored (since ebx is callee saved) before jumping to the 2146 // target@PLT. 2147 2148 // Note: The actual moving to ECX is done further down. 2149 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2150 if (G && !G->getGlobal()->hasHiddenVisibility() && 2151 !G->getGlobal()->hasProtectedVisibility()) 2152 Callee = LowerGlobalAddress(Callee, DAG); 2153 else if (isa<ExternalSymbolSDNode>(Callee)) 2154 Callee = LowerExternalSymbol(Callee, DAG); 2155 } 2156 } 2157 2158 if (Is64Bit && isVarArg && !IsWin64) { 2159 // From AMD64 ABI document: 2160 // For calls that may call functions that use varargs or stdargs 2161 // (prototype-less calls or calls to functions containing ellipsis (...) in 2162 // the declaration) %al is used as hidden argument to specify the number 2163 // of SSE registers used. 
The contents of %al do not need to match exactly 2164 // the number of registers, but must be an ubound on the number of SSE 2165 // registers used and is in the range 0 - 8 inclusive. 2166 2167 // Count the number of XMM registers allocated. 2168 static const unsigned XMMArgRegs[] = { 2169 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2170 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2171 }; 2172 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 2173 assert((Subtarget->hasXMM() || !NumXMMRegs) 2174 && "SSE registers cannot be used when SSE is disabled"); 2175 2176 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 2177 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 2178 InFlag = Chain.getValue(1); 2179 } 2180 2181 2182 // For tail calls lower the arguments to the 'real' stack slot. 2183 if (isTailCall) { 2184 // Force all the incoming stack arguments to be loaded from the stack 2185 // before any new outgoing arguments are stored to the stack, because the 2186 // outgoing stack slots may alias the incoming argument stack slots, and 2187 // the alias isn't otherwise explicit. This is slightly more conservative 2188 // than necessary, because it means that each store effectively depends 2189 // on every argument instead of just those arguments it would clobber. 2190 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2191 2192 SmallVector<SDValue, 8> MemOpChains2; 2193 SDValue FIN; 2194 int FI = 0; 2195 // Do not flag preceding copytoreg stuff together with the following stuff. 2196 InFlag = SDValue(); 2197 if (GuaranteedTailCallOpt) { 2198 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2199 CCValAssign &VA = ArgLocs[i]; 2200 if (VA.isRegLoc()) 2201 continue; 2202 assert(VA.isMemLoc()); 2203 SDValue Arg = OutVals[i]; 2204 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2205 // Create frame index. 2206 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2207 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2208 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 2209 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2210 2211 if (Flags.isByVal()) { 2212 // Copy relative to framepointer. 2213 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2214 if (StackPtr.getNode() == 0) 2215 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 2216 getPointerTy()); 2217 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2218 2219 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2220 ArgChain, 2221 Flags, DAG, dl)); 2222 } else { 2223 // Store relative to framepointer. 2224 MemOpChains2.push_back( 2225 DAG.getStore(ArgChain, dl, Arg, FIN, 2226 MachinePointerInfo::getFixedStack(FI), 2227 false, false, 0)); 2228 } 2229 } 2230 } 2231 2232 if (!MemOpChains2.empty()) 2233 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2234 &MemOpChains2[0], MemOpChains2.size()); 2235 2236 // Copy arguments to their registers. 2237 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2238 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2239 RegsToPass[i].second, InFlag); 2240 InFlag = Chain.getValue(1); 2241 } 2242 InFlag =SDValue(); 2243 2244 // Store the return address to the appropriate stack slot. 
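// Worked example (illustrative numbers): if the caller's own incoming
// argument area is 8 bytes but the callee needs 24 bytes of arguments,
// FPDiff = 8 - 24 = -16, so the return address is re-stored 16 bytes lower on
// the stack to make room before the tail jump.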
2245     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
2246                                      FPDiff, dl);
2247   }
2248
2249   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2250     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2251     // In the 64-bit large code model, we have to make all calls
2252     // through a register, since the call instruction's 32-bit
2253     // pc-relative offset may not be large enough to hold the whole
2254     // address.
2255   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2256     // If the callee is a GlobalAddress node (quite common, every direct call
2257     // is), turn it into a TargetGlobalAddress node so that legalize doesn't hack
2258     // it.
2259
2260     // We should use an extra load for direct calls to dllimported functions in
2261     // non-JIT mode.
2262     const GlobalValue *GV = G->getGlobal();
2263     if (!GV->hasDLLImportLinkage()) {
2264       unsigned char OpFlags = 0;
2265       bool ExtraLoad = false;
2266       unsigned WrapperKind = ISD::DELETED_NODE;
2267
2268       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2269       // external symbols must go through the PLT in PIC mode. If the symbol
2270       // has hidden or protected visibility, or if it is static or local, then
2271       // we don't need to use the PLT - we can directly call it.
2272       if (Subtarget->isTargetELF() &&
2273           getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2274           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2275         OpFlags = X86II::MO_PLT;
2276       } else if (Subtarget->isPICStyleStubAny() &&
2277                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
2278                  (!Subtarget->getTargetTriple().isMacOSX() ||
2279                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2280         // PC-relative references to external symbols should go through $stub,
2281         // unless we're building with the leopard linker or later, which
2282         // automatically synthesizes these stubs.
2283         OpFlags = X86II::MO_DARWIN_STUB;
2284       } else if (Subtarget->isPICStyleRIPRel() &&
2285                  isa<Function>(GV) &&
2286                  cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) {
2287         // If the function is marked as non-lazy, generate an indirect call
2288         // which loads from the GOT directly. This avoids runtime overhead
2289         // at the cost of eager binding (and one extra byte of encoding).
2290         OpFlags = X86II::MO_GOTPCREL;
2291         WrapperKind = X86ISD::WrapperRIP;
2292         ExtraLoad = true;
2293       }
2294
2295       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
2296                                           G->getOffset(), OpFlags);
2297
2298       // Add a wrapper if needed.
2299       if (WrapperKind != ISD::DELETED_NODE)
2300         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
2301       // Add extra indirection if needed.
2302       if (ExtraLoad)
2303         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
2304                              MachinePointerInfo::getGOT(),
2305                              false, false, 0);
2306     }
2307   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2308     unsigned char OpFlags = 0;
2309
2310     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
2311     // external symbols should go through the PLT.
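// For example, a PIC call to an external symbol such as "memcpy" receives the
// MO_PLT flag below and is emitted as "call memcpy@PLT".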
2312 if (Subtarget->isTargetELF() && 2313 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2314 OpFlags = X86II::MO_PLT; 2315 } else if (Subtarget->isPICStyleStubAny() && 2316 (!Subtarget->getTargetTriple().isMacOSX() || 2317 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2318 // PC-relative references to external symbols should go through $stub, 2319 // unless we're building with the leopard linker or later, which 2320 // automatically synthesizes these stubs. 2321 OpFlags = X86II::MO_DARWIN_STUB; 2322 } 2323 2324 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2325 OpFlags); 2326 } 2327 2328 // Returns a chain & a flag for retval copy to use. 2329 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2330 SmallVector<SDValue, 8> Ops; 2331 2332 if (!IsSibcall && isTailCall) { 2333 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2334 DAG.getIntPtrConstant(0, true), InFlag); 2335 InFlag = Chain.getValue(1); 2336 } 2337 2338 Ops.push_back(Chain); 2339 Ops.push_back(Callee); 2340 2341 if (isTailCall) 2342 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2343 2344 // Add argument registers to the end of the list so that they are known live 2345 // into the call. 2346 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2347 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2348 RegsToPass[i].second.getValueType())); 2349 2350 // Add an implicit use GOT pointer in EBX. 2351 if (!isTailCall && Subtarget->isPICStyleGOT()) 2352 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2353 2354 // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. 2355 if (Is64Bit && isVarArg && !IsWin64) 2356 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 2357 2358 if (InFlag.getNode()) 2359 Ops.push_back(InFlag); 2360 2361 if (isTailCall) { 2362 // We used to do: 2363 //// If this is the first return lowered for this function, add the regs 2364 //// to the liveout set for the function. 2365 // This isn't right, although it's probably harmless on x86; liveouts 2366 // should be computed from returns not tail calls. Consider a void 2367 // function making a tail call to a function returning int. 2368 return DAG.getNode(X86ISD::TC_RETURN, dl, 2369 NodeTys, &Ops[0], Ops.size()); 2370 } 2371 2372 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2373 InFlag = Chain.getValue(1); 2374 2375 // Create the CALLSEQ_END node. 2376 unsigned NumBytesForCalleeToPush; 2377 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) 2378 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2379 else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) 2380 // If this is a call to a struct-return function, the callee 2381 // pops the hidden struct pointer, so we have to push it back. 2382 // This is common for Darwin/X86, Linux & Mingw32 targets. 2383 NumBytesForCalleeToPush = 4; 2384 else 2385 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2386 2387 // Returns a flag for retval copy to use. 2388 if (!IsSibcall) { 2389 Chain = DAG.getCALLSEQ_END(Chain, 2390 DAG.getIntPtrConstant(NumBytes, true), 2391 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2392 true), 2393 InFlag); 2394 InFlag = Chain.getValue(1); 2395 } 2396 2397 // Handle result values, copying them out of physregs into vregs that we 2398 // return. 
2399   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2400                          Ins, dl, DAG, InVals);
2401}
2402
2403
2404//===----------------------------------------------------------------------===//
2405//                Fast Calling Convention (tail call) implementation
2406//===----------------------------------------------------------------------===//
2407
2408// Like the StdCall convention, the callee cleans up the arguments, except that ECX is
2409// reserved for storing the tail-called function's address. Only 2 registers are
2410// free for argument passing (inreg). Tail call optimization is performed
2411// provided:
2412// * tailcallopt is enabled
2413// * caller/callee are fastcc
2414// On the X86_64 architecture, with GOT-style position-independent code, only local
2415// (within-module) calls are supported at the moment.
2416// To keep the stack aligned according to the platform ABI, the function
2417// GetAlignedArgumentStackSize ensures that the argument delta is always a multiple
2418// of the stack alignment. (Dynamic linkers need this - darwin's dyld for example.)
2419// If a tail-called callee has more arguments than the caller, the
2420// caller needs to make sure that there is room to move the RETADDR to. This is
2421// achieved by reserving an area the size of the argument delta right after the
2422// original RETADDR, but before the saved frame pointer or the spilled registers,
2423// e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
2424// stack layout:
2425// arg1
2426// arg2
2427// RETADDR
2428// [ new RETADDR
2429// move area ]
2430// (possible EBP)
2431// ESI
2432// EDI
2433// local1 ..
2434
2435/// GetAlignedArgumentStackSize - Round the stack argument size up so that it stays
2436/// aligned once the return address slot is included, e.g. 16n + 12 for a 16-byte alignment requirement.
2437unsigned
2438X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2439                                               SelectionDAG& DAG) const {
2440  MachineFunction &MF = DAG.getMachineFunction();
2441  const TargetMachine &TM = MF.getTarget();
2442  const TargetFrameLowering &TFI = *TM.getFrameLowering();
2443  unsigned StackAlignment = TFI.getStackAlignment();
2444  uint64_t AlignMask = StackAlignment - 1;
2445  int64_t Offset = StackSize;
2446  uint64_t SlotSize = TD->getPointerSize();
2447  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2448    // The misalignment is at most StackAlignment - SlotSize (e.g. 12), so just add the difference.
2449    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2450  } else {
2451    // Mask out the lower bits and add one full StackAlignment plus StackAlignment - SlotSize (e.g. 12) bytes.
2452    Offset = ((~AlignMask) & Offset) + StackAlignment +
2453      (StackAlignment-SlotSize);
2454  }
2455  return Offset;
2456}
2457
2458/// MatchingStackOffset - Return true if the given stack call argument is
2459/// already available in the same position (relatively) of the caller's
2460/// incoming argument stack.
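// For example (illustrative IR), a sibcall that merely forwards a stack
// argument it received:
//   define i32 @caller(i32 inreg %a, i32 %onstack) {
//     %r = tail call i32 @callee(i32 inreg %a, i32 %onstack)
//     ret i32 %r
//   }
// requires %onstack to already sit at the same offset in the caller's incoming
// argument area; this helper verifies that by tracing the SDValue back to a
// fixed frame index.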
2461static 2462bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2463 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2464 const X86InstrInfo *TII) { 2465 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2466 int FI = INT_MAX; 2467 if (Arg.getOpcode() == ISD::CopyFromReg) { 2468 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2469 if (!TargetRegisterInfo::isVirtualRegister(VR)) 2470 return false; 2471 MachineInstr *Def = MRI->getVRegDef(VR); 2472 if (!Def) 2473 return false; 2474 if (!Flags.isByVal()) { 2475 if (!TII->isLoadFromStackSlot(Def, FI)) 2476 return false; 2477 } else { 2478 unsigned Opcode = Def->getOpcode(); 2479 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2480 Def->getOperand(1).isFI()) { 2481 FI = Def->getOperand(1).getIndex(); 2482 Bytes = Flags.getByValSize(); 2483 } else 2484 return false; 2485 } 2486 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2487 if (Flags.isByVal()) 2488 // ByVal argument is passed in as a pointer but it's now being 2489 // dereferenced. e.g. 2490 // define @foo(%struct.X* %A) { 2491 // tail call @bar(%struct.X* byval %A) 2492 // } 2493 return false; 2494 SDValue Ptr = Ld->getBasePtr(); 2495 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2496 if (!FINode) 2497 return false; 2498 FI = FINode->getIndex(); 2499 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 2500 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 2501 FI = FINode->getIndex(); 2502 Bytes = Flags.getByValSize(); 2503 } else 2504 return false; 2505 2506 assert(FI != INT_MAX); 2507 if (!MFI->isFixedObjectIndex(FI)) 2508 return false; 2509 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2510} 2511 2512/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2513/// for tail call optimization. Targets which want to do tail call 2514/// optimization should implement this function. 2515bool 2516X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2517 CallingConv::ID CalleeCC, 2518 bool isVarArg, 2519 bool isCalleeStructRet, 2520 bool isCallerStructRet, 2521 const SmallVectorImpl<ISD::OutputArg> &Outs, 2522 const SmallVectorImpl<SDValue> &OutVals, 2523 const SmallVectorImpl<ISD::InputArg> &Ins, 2524 SelectionDAG& DAG) const { 2525 if (!IsTailCallConvention(CalleeCC) && 2526 CalleeCC != CallingConv::C) 2527 return false; 2528 2529 // If -tailcallopt is specified, make fastcc functions tail-callable. 2530 const MachineFunction &MF = DAG.getMachineFunction(); 2531 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2532 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2533 bool CCMatch = CallerCC == CalleeCC; 2534 2535 if (GuaranteedTailCallOpt) { 2536 if (IsTailCallConvention(CalleeCC) && CCMatch) 2537 return true; 2538 return false; 2539 } 2540 2541 // Look for obvious safe cases to perform tail call optimization that do not 2542 // require ABI changes. This is what gcc calls sibcall. 2543 2544 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2545 // emit a special epilogue. 2546 if (RegInfo->needsStackRealignment(MF)) 2547 return false; 2548 2549 // Also avoid sibcall optimization if either caller or callee uses struct 2550 // return semantics. 2551 if (isCalleeStructRet || isCallerStructRet) 2552 return false; 2553 2554 // An stdcall caller is expected to clean up its arguments; the callee 2555 // isn't going to do that. 
2556 if (!CCMatch && CallerCC==CallingConv::X86_StdCall) 2557 return false; 2558 2559 // Do not sibcall optimize vararg calls unless all arguments are passed via 2560 // registers. 2561 if (isVarArg && !Outs.empty()) { 2562 2563 // Optimizing for varargs on Win64 is unlikely to be safe without 2564 // additional testing. 2565 if (Subtarget->isTargetWin64()) 2566 return false; 2567 2568 SmallVector<CCValAssign, 16> ArgLocs; 2569 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2570 getTargetMachine(), ArgLocs, *DAG.getContext()); 2571 2572 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2573 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 2574 if (!ArgLocs[i].isRegLoc()) 2575 return false; 2576 } 2577 2578 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2579 // Therefore if it's not used by the call it is not safe to optimize this into 2580 // a sibcall. 2581 bool Unused = false; 2582 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2583 if (!Ins[i].Used) { 2584 Unused = true; 2585 break; 2586 } 2587 } 2588 if (Unused) { 2589 SmallVector<CCValAssign, 16> RVLocs; 2590 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), 2591 getTargetMachine(), RVLocs, *DAG.getContext()); 2592 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2593 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2594 CCValAssign &VA = RVLocs[i]; 2595 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2596 return false; 2597 } 2598 } 2599 2600 // If the calling conventions do not match, then we'd better make sure the 2601 // results are returned in the same way as what the caller expects. 2602 if (!CCMatch) { 2603 SmallVector<CCValAssign, 16> RVLocs1; 2604 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), 2605 getTargetMachine(), RVLocs1, *DAG.getContext()); 2606 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2607 2608 SmallVector<CCValAssign, 16> RVLocs2; 2609 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), 2610 getTargetMachine(), RVLocs2, *DAG.getContext()); 2611 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2612 2613 if (RVLocs1.size() != RVLocs2.size()) 2614 return false; 2615 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2616 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2617 return false; 2618 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2619 return false; 2620 if (RVLocs1[i].isRegLoc()) { 2621 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2622 return false; 2623 } else { 2624 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2625 return false; 2626 } 2627 } 2628 } 2629 2630 // If the callee takes no arguments then go on to check the results of the 2631 // call. 2632 if (!Outs.empty()) { 2633 // Check if stack adjustment is needed. For now, do not do this if any 2634 // argument is passed on the stack. 2635 SmallVector<CCValAssign, 16> ArgLocs; 2636 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2637 getTargetMachine(), ArgLocs, *DAG.getContext()); 2638 2639 // Allocate shadow area for Win64 2640 if (Subtarget->isTargetWin64()) { 2641 CCInfo.AllocateStack(32, 8); 2642 } 2643 2644 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2645 if (CCInfo.getNextStackOffset()) { 2646 MachineFunction &MF = DAG.getMachineFunction(); 2647 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2648 return false; 2649 2650 // Check if the arguments are already laid out in the right way as 2651 // the caller's fixed stack objects. 
2652 MachineFrameInfo *MFI = MF.getFrameInfo(); 2653 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2654 const X86InstrInfo *TII = 2655 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2656 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2657 CCValAssign &VA = ArgLocs[i]; 2658 SDValue Arg = OutVals[i]; 2659 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2660 if (VA.getLocInfo() == CCValAssign::Indirect) 2661 return false; 2662 if (!VA.isRegLoc()) { 2663 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2664 MFI, MRI, TII)) 2665 return false; 2666 } 2667 } 2668 } 2669 2670 // If the tailcall address may be in a register, then make sure it's 2671 // possible to register allocate for it. In 32-bit, the call address can 2672 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2673 // callee-saved registers are restored. These happen to be the same 2674 // registers used to pass 'inreg' arguments so watch out for those. 2675 if (!Subtarget->is64Bit() && 2676 !isa<GlobalAddressSDNode>(Callee) && 2677 !isa<ExternalSymbolSDNode>(Callee)) { 2678 unsigned NumInRegs = 0; 2679 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2680 CCValAssign &VA = ArgLocs[i]; 2681 if (!VA.isRegLoc()) 2682 continue; 2683 unsigned Reg = VA.getLocReg(); 2684 switch (Reg) { 2685 default: break; 2686 case X86::EAX: case X86::EDX: case X86::ECX: 2687 if (++NumInRegs == 3) 2688 return false; 2689 break; 2690 } 2691 } 2692 } 2693 } 2694 2695 return true; 2696} 2697 2698FastISel * 2699X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2700 return X86::createFastISel(funcInfo); 2701} 2702 2703 2704//===----------------------------------------------------------------------===// 2705// Other Lowering Hooks 2706//===----------------------------------------------------------------------===// 2707 2708static bool MayFoldLoad(SDValue Op) { 2709 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 2710} 2711 2712static bool MayFoldIntoStore(SDValue Op) { 2713 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 2714} 2715 2716static bool isTargetShuffle(unsigned Opcode) { 2717 switch(Opcode) { 2718 default: return false; 2719 case X86ISD::PSHUFD: 2720 case X86ISD::PSHUFHW: 2721 case X86ISD::PSHUFLW: 2722 case X86ISD::SHUFPD: 2723 case X86ISD::PALIGN: 2724 case X86ISD::SHUFPS: 2725 case X86ISD::MOVLHPS: 2726 case X86ISD::MOVLHPD: 2727 case X86ISD::MOVHLPS: 2728 case X86ISD::MOVLPS: 2729 case X86ISD::MOVLPD: 2730 case X86ISD::MOVSHDUP: 2731 case X86ISD::MOVSLDUP: 2732 case X86ISD::MOVDDUP: 2733 case X86ISD::MOVSS: 2734 case X86ISD::MOVSD: 2735 case X86ISD::UNPCKLPS: 2736 case X86ISD::UNPCKLPD: 2737 case X86ISD::VUNPCKLPSY: 2738 case X86ISD::VUNPCKLPDY: 2739 case X86ISD::PUNPCKLWD: 2740 case X86ISD::PUNPCKLBW: 2741 case X86ISD::PUNPCKLDQ: 2742 case X86ISD::PUNPCKLQDQ: 2743 case X86ISD::UNPCKHPS: 2744 case X86ISD::UNPCKHPD: 2745 case X86ISD::VUNPCKHPSY: 2746 case X86ISD::VUNPCKHPDY: 2747 case X86ISD::PUNPCKHWD: 2748 case X86ISD::PUNPCKHBW: 2749 case X86ISD::PUNPCKHDQ: 2750 case X86ISD::PUNPCKHQDQ: 2751 case X86ISD::VPERMILPS: 2752 case X86ISD::VPERMILPSY: 2753 case X86ISD::VPERMILPD: 2754 case X86ISD::VPERMILPDY: 2755 return true; 2756 } 2757 return false; 2758} 2759 2760static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2761 SDValue V1, SelectionDAG &DAG) { 2762 switch(Opc) { 2763 default: llvm_unreachable("Unknown x86 shuffle node"); 2764 case X86ISD::MOVSHDUP: 2765 case X86ISD::MOVSLDUP: 2766 case X86ISD::MOVDDUP: 2767 
return DAG.getNode(Opc, dl, VT, V1); 2768 } 2769 2770 return SDValue(); 2771} 2772 2773static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2774 SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { 2775 switch(Opc) { 2776 default: llvm_unreachable("Unknown x86 shuffle node"); 2777 case X86ISD::PSHUFD: 2778 case X86ISD::PSHUFHW: 2779 case X86ISD::PSHUFLW: 2780 case X86ISD::VPERMILPS: 2781 case X86ISD::VPERMILPSY: 2782 case X86ISD::VPERMILPD: 2783 case X86ISD::VPERMILPDY: 2784 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2785 } 2786 2787 return SDValue(); 2788} 2789 2790static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2791 SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { 2792 switch(Opc) { 2793 default: llvm_unreachable("Unknown x86 shuffle node"); 2794 case X86ISD::PALIGN: 2795 case X86ISD::SHUFPD: 2796 case X86ISD::SHUFPS: 2797 return DAG.getNode(Opc, dl, VT, V1, V2, 2798 DAG.getConstant(TargetMask, MVT::i8)); 2799 } 2800 return SDValue(); 2801} 2802 2803static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2804 SDValue V1, SDValue V2, SelectionDAG &DAG) { 2805 switch(Opc) { 2806 default: llvm_unreachable("Unknown x86 shuffle node"); 2807 case X86ISD::MOVLHPS: 2808 case X86ISD::MOVLHPD: 2809 case X86ISD::MOVHLPS: 2810 case X86ISD::MOVLPS: 2811 case X86ISD::MOVLPD: 2812 case X86ISD::MOVSS: 2813 case X86ISD::MOVSD: 2814 case X86ISD::UNPCKLPS: 2815 case X86ISD::UNPCKLPD: 2816 case X86ISD::VUNPCKLPSY: 2817 case X86ISD::VUNPCKLPDY: 2818 case X86ISD::PUNPCKLWD: 2819 case X86ISD::PUNPCKLBW: 2820 case X86ISD::PUNPCKLDQ: 2821 case X86ISD::PUNPCKLQDQ: 2822 case X86ISD::UNPCKHPS: 2823 case X86ISD::UNPCKHPD: 2824 case X86ISD::VUNPCKHPSY: 2825 case X86ISD::VUNPCKHPDY: 2826 case X86ISD::PUNPCKHWD: 2827 case X86ISD::PUNPCKHBW: 2828 case X86ISD::PUNPCKHDQ: 2829 case X86ISD::PUNPCKHQDQ: 2830 return DAG.getNode(Opc, dl, VT, V1, V2); 2831 } 2832 return SDValue(); 2833} 2834 2835SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 2836 MachineFunction &MF = DAG.getMachineFunction(); 2837 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2838 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2839 2840 if (ReturnAddrIndex == 0) { 2841 // Set up a frame object for the return address. 2842 uint64_t SlotSize = TD->getPointerSize(); 2843 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2844 false); 2845 FuncInfo->setRAIndex(ReturnAddrIndex); 2846 } 2847 2848 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2849} 2850 2851 2852bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2853 bool hasSymbolicDisplacement) { 2854 // Offset should fit into 32 bit immediate field. 2855 if (!isInt<32>(Offset)) 2856 return false; 2857 2858 // If we don't have a symbolic displacement - we don't have any extra 2859 // restrictions. 2860 if (!hasSymbolicDisplacement) 2861 return true; 2862 2863 // FIXME: Some tweaks might be needed for medium code model. 2864 if (M != CodeModel::Small && M != CodeModel::Kernel) 2865 return false; 2866 2867 // For small code model we assume that latest object is 16MB before end of 31 2868 // bits boundary. We may also accept pretty large negative constants knowing 2869 // that all objects are in the positive half of address space. 
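// Worked example (illustrative): with a symbolic displacement in the small
// code model, an addend of 8 MiB passes the check below, while one of
// 0x7f000000 is rejected because it could push the address past the
// 16 MiB-before-2^31 safety margin.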
2870 if (M == CodeModel::Small && Offset < 16*1024*1024) 2871 return true; 2872 2873 // For kernel code model we know that all object resist in the negative half 2874 // of 32bits address space. We may not accept negative offsets, since they may 2875 // be just off and we may accept pretty large positive ones. 2876 if (M == CodeModel::Kernel && Offset > 0) 2877 return true; 2878 2879 return false; 2880} 2881 2882/// isCalleePop - Determines whether the callee is required to pop its 2883/// own arguments. Callee pop is necessary to support tail calls. 2884bool X86::isCalleePop(CallingConv::ID CallingConv, 2885 bool is64Bit, bool IsVarArg, bool TailCallOpt) { 2886 if (IsVarArg) 2887 return false; 2888 2889 switch (CallingConv) { 2890 default: 2891 return false; 2892 case CallingConv::X86_StdCall: 2893 return !is64Bit; 2894 case CallingConv::X86_FastCall: 2895 return !is64Bit; 2896 case CallingConv::X86_ThisCall: 2897 return !is64Bit; 2898 case CallingConv::Fast: 2899 return TailCallOpt; 2900 case CallingConv::GHC: 2901 return TailCallOpt; 2902 } 2903} 2904 2905/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 2906/// specific condition code, returning the condition code and the LHS/RHS of the 2907/// comparison to make. 2908static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2909 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2910 if (!isFP) { 2911 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2912 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2913 // X > -1 -> X == 0, jump !sign. 2914 RHS = DAG.getConstant(0, RHS.getValueType()); 2915 return X86::COND_NS; 2916 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2917 // X < 0 -> X == 0, jump on sign. 2918 return X86::COND_S; 2919 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2920 // X < 1 -> X <= 0 2921 RHS = DAG.getConstant(0, RHS.getValueType()); 2922 return X86::COND_LE; 2923 } 2924 } 2925 2926 switch (SetCCOpcode) { 2927 default: llvm_unreachable("Invalid integer condition!"); 2928 case ISD::SETEQ: return X86::COND_E; 2929 case ISD::SETGT: return X86::COND_G; 2930 case ISD::SETGE: return X86::COND_GE; 2931 case ISD::SETLT: return X86::COND_L; 2932 case ISD::SETLE: return X86::COND_LE; 2933 case ISD::SETNE: return X86::COND_NE; 2934 case ISD::SETULT: return X86::COND_B; 2935 case ISD::SETUGT: return X86::COND_A; 2936 case ISD::SETULE: return X86::COND_BE; 2937 case ISD::SETUGE: return X86::COND_AE; 2938 } 2939 } 2940 2941 // First determine if it is required or is profitable to flip the operands. 2942 2943 // If LHS is a foldable load, but RHS is not, flip the condition. 
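  // For example, (setolt (load X), Y) is rewritten as (setogt Y, (load X)) so the
  // load ends up as the second operand, where the compare can fold it from memory.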
2944 if (ISD::isNON_EXTLoad(LHS.getNode()) && 2945 !ISD::isNON_EXTLoad(RHS.getNode())) { 2946 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2947 std::swap(LHS, RHS); 2948 } 2949 2950 switch (SetCCOpcode) { 2951 default: break; 2952 case ISD::SETOLT: 2953 case ISD::SETOLE: 2954 case ISD::SETUGT: 2955 case ISD::SETUGE: 2956 std::swap(LHS, RHS); 2957 break; 2958 } 2959 2960 // On a floating point condition, the flags are set as follows: 2961 // ZF PF CF op 2962 // 0 | 0 | 0 | X > Y 2963 // 0 | 0 | 1 | X < Y 2964 // 1 | 0 | 0 | X == Y 2965 // 1 | 1 | 1 | unordered 2966 switch (SetCCOpcode) { 2967 default: llvm_unreachable("Condcode should be pre-legalized away"); 2968 case ISD::SETUEQ: 2969 case ISD::SETEQ: return X86::COND_E; 2970 case ISD::SETOLT: // flipped 2971 case ISD::SETOGT: 2972 case ISD::SETGT: return X86::COND_A; 2973 case ISD::SETOLE: // flipped 2974 case ISD::SETOGE: 2975 case ISD::SETGE: return X86::COND_AE; 2976 case ISD::SETUGT: // flipped 2977 case ISD::SETULT: 2978 case ISD::SETLT: return X86::COND_B; 2979 case ISD::SETUGE: // flipped 2980 case ISD::SETULE: 2981 case ISD::SETLE: return X86::COND_BE; 2982 case ISD::SETONE: 2983 case ISD::SETNE: return X86::COND_NE; 2984 case ISD::SETUO: return X86::COND_P; 2985 case ISD::SETO: return X86::COND_NP; 2986 case ISD::SETOEQ: 2987 case ISD::SETUNE: return X86::COND_INVALID; 2988 } 2989} 2990 2991/// hasFPCMov - is there a floating point cmov for the specific X86 condition 2992/// code. Current x86 isa includes the following FP cmov instructions: 2993/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 2994static bool hasFPCMov(unsigned X86CC) { 2995 switch (X86CC) { 2996 default: 2997 return false; 2998 case X86::COND_B: 2999 case X86::COND_BE: 3000 case X86::COND_E: 3001 case X86::COND_P: 3002 case X86::COND_A: 3003 case X86::COND_AE: 3004 case X86::COND_NE: 3005 case X86::COND_NP: 3006 return true; 3007 } 3008} 3009 3010/// isFPImmLegal - Returns true if the target can instruction select the 3011/// specified FP immediate natively. If false, the legalizer will 3012/// materialize the FP immediate as a load from a constant pool. 3013bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 3014 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 3015 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 3016 return true; 3017 } 3018 return false; 3019} 3020 3021/// isUndefOrInRange - Return true if Val is undef or if its value falls within 3022/// the specified range (L, H]. 3023static bool isUndefOrInRange(int Val, int Low, int Hi) { 3024 return (Val < 0) || (Val >= Low && Val < Hi); 3025} 3026 3027/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 3028/// specified value. 3029static bool isUndefOrEqual(int Val, int CmpVal) { 3030 if (Val < 0 || Val == CmpVal) 3031 return true; 3032 return false; 3033} 3034 3035/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 3036/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 3037/// the second operand. 
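/// For example, for v4i32 the mask <2, 3, 0, 1> is a valid PSHUFD mask, while
/// <0, 1, 4, 5> is not, because indices 4 and 5 refer to the second operand.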
3038static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3039 if (VT == MVT::v4f32 || VT == MVT::v4i32 ) 3040 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 3041 if (VT == MVT::v2f64 || VT == MVT::v2i64) 3042 return (Mask[0] < 2 && Mask[1] < 2); 3043 return false; 3044} 3045 3046bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 3047 SmallVector<int, 8> M; 3048 N->getMask(M); 3049 return ::isPSHUFDMask(M, N->getValueType(0)); 3050} 3051 3052/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 3053/// is suitable for input to PSHUFHW. 3054static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3055 if (VT != MVT::v8i16) 3056 return false; 3057 3058 // Lower quadword copied in order or undef. 3059 for (int i = 0; i != 4; ++i) 3060 if (Mask[i] >= 0 && Mask[i] != i) 3061 return false; 3062 3063 // Upper quadword shuffled. 3064 for (int i = 4; i != 8; ++i) 3065 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 3066 return false; 3067 3068 return true; 3069} 3070 3071bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 3072 SmallVector<int, 8> M; 3073 N->getMask(M); 3074 return ::isPSHUFHWMask(M, N->getValueType(0)); 3075} 3076 3077/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 3078/// is suitable for input to PSHUFLW. 3079static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3080 if (VT != MVT::v8i16) 3081 return false; 3082 3083 // Upper quadword copied in order. 3084 for (int i = 4; i != 8; ++i) 3085 if (Mask[i] >= 0 && Mask[i] != i) 3086 return false; 3087 3088 // Lower quadword shuffled. 3089 for (int i = 0; i != 4; ++i) 3090 if (Mask[i] >= 4) 3091 return false; 3092 3093 return true; 3094} 3095 3096bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 3097 SmallVector<int, 8> M; 3098 N->getMask(M); 3099 return ::isPSHUFLWMask(M, N->getValueType(0)); 3100} 3101 3102/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 3103/// is suitable for input to PALIGNR. 3104static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 3105 bool hasSSSE3) { 3106 int i, e = VT.getVectorNumElements(); 3107 if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64) 3108 return false; 3109 3110 // Do not handle v2i64 / v2f64 shuffles with palignr. 3111 if (e < 4 || !hasSSSE3) 3112 return false; 3113 3114 for (i = 0; i != e; ++i) 3115 if (Mask[i] >= 0) 3116 break; 3117 3118 // All undef, not a palignr. 3119 if (i == e) 3120 return false; 3121 3122 // Make sure we're shifting in the right direction. 3123 if (Mask[i] <= i) 3124 return false; 3125 3126 int s = Mask[i] - i; 3127 3128 // Check the rest of the elements to see if they are consecutive. 3129 for (++i; i != e; ++i) { 3130 int m = Mask[i]; 3131 if (m >= 0 && m != s+i) 3132 return false; 3133 } 3134 return true; 3135} 3136 3137/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 3138/// specifies a shuffle of elements that is suitable for input to SHUFP*. 
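/// For example, for v4f32 the mask <0, 3, 4, 6> matches SHUFPS: the low half
/// selects from V1 (indices 0..3) and the high half from V2 (indices 4..7).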
3139static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3140 int NumElems = VT.getVectorNumElements(); 3141 if (NumElems != 2 && NumElems != 4) 3142 return false; 3143 3144 int Half = NumElems / 2; 3145 for (int i = 0; i < Half; ++i) 3146 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 3147 return false; 3148 for (int i = Half; i < NumElems; ++i) 3149 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 3150 return false; 3151 3152 return true; 3153} 3154 3155bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 3156 SmallVector<int, 8> M; 3157 N->getMask(M); 3158 return ::isSHUFPMask(M, N->getValueType(0)); 3159} 3160 3161/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 3162/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 3163/// half elements to come from vector 1 (which would equal the dest.) and 3164/// the upper half to come from vector 2. 3165static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3166 int NumElems = VT.getVectorNumElements(); 3167 3168 if (NumElems != 2 && NumElems != 4) 3169 return false; 3170 3171 int Half = NumElems / 2; 3172 for (int i = 0; i < Half; ++i) 3173 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 3174 return false; 3175 for (int i = Half; i < NumElems; ++i) 3176 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 3177 return false; 3178 return true; 3179} 3180 3181static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 3182 SmallVector<int, 8> M; 3183 N->getMask(M); 3184 return isCommutedSHUFPMask(M, N->getValueType(0)); 3185} 3186 3187/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 3188/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 3189bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 3190 EVT VT = N->getValueType(0); 3191 unsigned NumElems = VT.getVectorNumElements(); 3192 3193 if (VT.getSizeInBits() != 128) 3194 return false; 3195 3196 if (NumElems != 4) 3197 return false; 3198 3199 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 3200 return isUndefOrEqual(N->getMaskElt(0), 6) && 3201 isUndefOrEqual(N->getMaskElt(1), 7) && 3202 isUndefOrEqual(N->getMaskElt(2), 2) && 3203 isUndefOrEqual(N->getMaskElt(3), 3); 3204} 3205 3206/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 3207/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 3208/// <2, 3, 2, 3> 3209bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 3210 EVT VT = N->getValueType(0); 3211 unsigned NumElems = VT.getVectorNumElements(); 3212 3213 if (VT.getSizeInBits() != 128) 3214 return false; 3215 3216 if (NumElems != 4) 3217 return false; 3218 3219 return isUndefOrEqual(N->getMaskElt(0), 2) && 3220 isUndefOrEqual(N->getMaskElt(1), 3) && 3221 isUndefOrEqual(N->getMaskElt(2), 2) && 3222 isUndefOrEqual(N->getMaskElt(3), 3); 3223} 3224 3225/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3226/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 
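/// For example, for v4f32 the mask <4, 5, 2, 3> matches MOVLPS: the low two
/// elements come from V2 and the high two are passed through from V1.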
3227bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 3228 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3229 3230 if (NumElems != 2 && NumElems != 4) 3231 return false; 3232 3233 for (unsigned i = 0; i < NumElems/2; ++i) 3234 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 3235 return false; 3236 3237 for (unsigned i = NumElems/2; i < NumElems; ++i) 3238 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3239 return false; 3240 3241 return true; 3242} 3243 3244/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3245/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 3246bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 3247 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3248 3249 if ((NumElems != 2 && NumElems != 4) 3250 || N->getValueType(0).getSizeInBits() > 128) 3251 return false; 3252 3253 for (unsigned i = 0; i < NumElems/2; ++i) 3254 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3255 return false; 3256 3257 for (unsigned i = 0; i < NumElems/2; ++i) 3258 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 3259 return false; 3260 3261 return true; 3262} 3263 3264/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3265/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3266static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3267 bool V2IsSplat = false) { 3268 int NumElts = VT.getVectorNumElements(); 3269 3270 assert((VT.is128BitVector() || VT.is256BitVector()) && 3271 "Unsupported vector type for unpckh"); 3272 3273 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) 3274 return false; 3275 3276 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3277 // independently on 128-bit lanes. 3278 unsigned NumLanes = VT.getSizeInBits()/128; 3279 unsigned NumLaneElts = NumElts/NumLanes; 3280 3281 unsigned Start = 0; 3282 unsigned End = NumLaneElts; 3283 for (unsigned s = 0; s < NumLanes; ++s) { 3284 for (unsigned i = Start, j = s * NumLaneElts; 3285 i != End; 3286 i += 2, ++j) { 3287 int BitI = Mask[i]; 3288 int BitI1 = Mask[i+1]; 3289 if (!isUndefOrEqual(BitI, j)) 3290 return false; 3291 if (V2IsSplat) { 3292 if (!isUndefOrEqual(BitI1, NumElts)) 3293 return false; 3294 } else { 3295 if (!isUndefOrEqual(BitI1, j + NumElts)) 3296 return false; 3297 } 3298 } 3299 // Process the next 128 bits. 3300 Start += NumLaneElts; 3301 End += NumLaneElts; 3302 } 3303 3304 return true; 3305} 3306 3307bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3308 SmallVector<int, 8> M; 3309 N->getMask(M); 3310 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3311} 3312 3313/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3314/// specifies a shuffle of elements that is suitable for input to UNPCKH. 3315static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 3316 bool V2IsSplat = false) { 3317 int NumElts = VT.getVectorNumElements(); 3318 3319 assert((VT.is128BitVector() || VT.is256BitVector()) && 3320 "Unsupported vector type for unpckh"); 3321 3322 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) 3323 return false; 3324 3325 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3326 // independently on 128-bit lanes. 
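  // For example, for v8i32 the expected unpckh mask is <2, 10, 3, 11, 6, 14, 7, 15>:
  // each 128-bit lane interleaves the high half of its own V1 lane with the high
  // half of the corresponding V2 lane.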
3327 unsigned NumLanes = VT.getSizeInBits()/128; 3328 unsigned NumLaneElts = NumElts/NumLanes; 3329 3330 unsigned Start = 0; 3331 unsigned End = NumLaneElts; 3332 for (unsigned l = 0; l != NumLanes; ++l) { 3333 for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2; 3334 i != End; i += 2, ++j) { 3335 int BitI = Mask[i]; 3336 int BitI1 = Mask[i+1]; 3337 if (!isUndefOrEqual(BitI, j)) 3338 return false; 3339 if (V2IsSplat) { 3340 if (isUndefOrEqual(BitI1, NumElts)) 3341 return false; 3342 } else { 3343 if (!isUndefOrEqual(BitI1, j+NumElts)) 3344 return false; 3345 } 3346 } 3347 // Process the next 128 bits. 3348 Start += NumLaneElts; 3349 End += NumLaneElts; 3350 } 3351 return true; 3352} 3353 3354bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3355 SmallVector<int, 8> M; 3356 N->getMask(M); 3357 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 3358} 3359 3360/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3361/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 3362/// <0, 0, 1, 1> 3363static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3364 int NumElems = VT.getVectorNumElements(); 3365 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3366 return false; 3367 3368 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3369 // independently on 128-bit lanes. 3370 unsigned NumLanes = VT.getSizeInBits() / 128; 3371 unsigned NumLaneElts = NumElems / NumLanes; 3372 3373 for (unsigned s = 0; s < NumLanes; ++s) { 3374 for (unsigned i = s * NumLaneElts, j = s * NumLaneElts; 3375 i != NumLaneElts * (s + 1); 3376 i += 2, ++j) { 3377 int BitI = Mask[i]; 3378 int BitI1 = Mask[i+1]; 3379 3380 if (!isUndefOrEqual(BitI, j)) 3381 return false; 3382 if (!isUndefOrEqual(BitI1, j)) 3383 return false; 3384 } 3385 } 3386 3387 return true; 3388} 3389 3390bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 3391 SmallVector<int, 8> M; 3392 N->getMask(M); 3393 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 3394} 3395 3396/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3397/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 3398/// <2, 2, 3, 3> 3399static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3400 int NumElems = VT.getVectorNumElements(); 3401 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3402 return false; 3403 3404 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 3405 int BitI = Mask[i]; 3406 int BitI1 = Mask[i+1]; 3407 if (!isUndefOrEqual(BitI, j)) 3408 return false; 3409 if (!isUndefOrEqual(BitI1, j)) 3410 return false; 3411 } 3412 return true; 3413} 3414 3415bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 3416 SmallVector<int, 8> M; 3417 N->getMask(M); 3418 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 3419} 3420 3421/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3422/// specifies a shuffle of elements that is suitable for input to MOVSS, 3423/// MOVSD, and MOVD, i.e. setting the lowest element. 
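/// For example, for v4i32 the canonical MOVSS mask is <4, 1, 2, 3>: element 0 is
/// taken from V2 and the remaining elements are passed through from V1.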
3424 static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3425 if (VT.getVectorElementType().getSizeInBits() < 32) 3426 return false; 3427 3428 int NumElts = VT.getVectorNumElements(); 3429 3430 if (!isUndefOrEqual(Mask[0], NumElts)) 3431 return false; 3432 3433 for (int i = 1; i < NumElts; ++i) 3434 if (!isUndefOrEqual(Mask[i], i)) 3435 return false; 3436 3437 return true; 3438 } 3439 3440 bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 3441 SmallVector<int, 8> M; 3442 N->getMask(M); 3443 return ::isMOVLMask(M, N->getValueType(0)); 3444 } 3445 3446 /// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand 3447 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*. 3448 /// Note that VPERMIL mask matching is different depending on whether the 3449 /// underlying element type is 32 or 64 bits. In VPERMILPS the high half of the mask should point 3450 /// to the same elements as the low half, but relative to the upper half of the source. 3451 /// In VPERMILPD the two lanes can be shuffled independently of each other 3452 /// with the same restriction that lanes can't be crossed. 3453 static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT, 3454 const X86Subtarget *Subtarget) { 3455 int NumElts = VT.getVectorNumElements(); 3456 int NumLanes = VT.getSizeInBits()/128; 3457 3458 if (!Subtarget->hasAVX()) 3459 return false; 3460 3461 // Match any permutation of a 128-bit vector with 64-bit element types 3462 if (NumLanes == 1 && NumElts != 2) 3463 return false; 3464 3465 // Only match 256-bit vectors with 64-bit element types 3466 if (VT.getSizeInBits() == 256 && NumElts != 4) 3467 return false; 3468 3469 // The mask on the high lane is independent of the low. Both can match 3470 // any element inside its own lane, but can't cross. 3471 int LaneSize = NumElts/NumLanes; 3472 for (int l = 0; l < NumLanes; ++l) 3473 for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { 3474 int LaneStart = l*LaneSize; 3475 if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize)) 3476 return false; 3477 } 3478 3479 return true; 3480 } 3481 3482 /// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand 3483 /// specifies a shuffle of elements that is suitable for input to VPERMILPS*. 3484 /// Note that VPERMIL mask matching is different depending on whether the 3485 /// underlying element type is 32 or 64 bits. In VPERMILPS the high half of the mask should point 3486 /// to the same elements as the low half, but relative to the upper half of the source. 3487 /// In VPERMILPD the two lanes can be shuffled independently of each other 3488 /// with the same restriction that lanes can't be crossed. 3489 static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT, 3490 const X86Subtarget *Subtarget) { 3491 unsigned NumElts = VT.getVectorNumElements(); 3492 unsigned NumLanes = VT.getSizeInBits()/128; 3493 3494 if (!Subtarget->hasAVX()) 3495 return false; 3496 3497 // Match any permutation of a 128-bit vector with 32-bit element types 3498 if (NumLanes == 1 && NumElts != 4) 3499 return false; 3500 3501 // Only match 256-bit vectors with 32-bit element types 3502 if (VT.getSizeInBits() == 256 && NumElts != 8) 3503 return false; 3504 3505 // The mask on the high lane should be the same as the low. Actually, 3506 // they can differ if any of the corresponding indices in a lane is undef 3507 // and the other stays in range.
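  // For example, for v8f32 both <1, 2, 2, 3, 5, 6, 6, 7> and <1, u, 2, 3, 5, 6, u, 7>
  // are accepted, since every defined high-lane index is exactly LaneSize (4)
  // above its low-lane counterpart.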
3508 int LaneSize = NumElts/NumLanes; 3509 for (int i = 0; i < LaneSize; ++i) { 3510 int HighElt = i+LaneSize; 3511 if (Mask[i] < 0 && (isUndefOrInRange(Mask[HighElt], LaneSize, NumElts))) 3512 continue; 3513 if (Mask[HighElt] < 0 && (isUndefOrInRange(Mask[i], 0, LaneSize))) 3514 continue; 3515 if (Mask[HighElt]-Mask[i] != LaneSize) 3516 return false; 3517 } 3518 3519 return true; 3520} 3521 3522/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle 3523/// the specified VECTOR_MASK mask with VPERMILPS* instructions. 3524static unsigned getShuffleVPERMILPSImmediate(SDNode *N) { 3525 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3526 EVT VT = SVOp->getValueType(0); 3527 3528 int NumElts = VT.getVectorNumElements(); 3529 int NumLanes = VT.getSizeInBits()/128; 3530 int LaneSize = NumElts/NumLanes; 3531 3532 // Although the mask is equal for both lanes do it twice to get the cases 3533 // where a mask will match because the same mask element is undef on the 3534 // first half but valid on the second. This would get pathological cases 3535 // such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid. 3536 unsigned Mask = 0; 3537 for (int l = 0; l < NumLanes; ++l) { 3538 for (int i = 0; i < LaneSize; ++i) { 3539 int MaskElt = SVOp->getMaskElt(i+(l*LaneSize)); 3540 if (MaskElt < 0) 3541 continue; 3542 if (MaskElt >= LaneSize) 3543 MaskElt -= LaneSize; 3544 Mask |= MaskElt << (i*2); 3545 } 3546 } 3547 3548 return Mask; 3549} 3550 3551/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle 3552/// the specified VECTOR_MASK mask with VPERMILPD* instructions. 3553static unsigned getShuffleVPERMILPDImmediate(SDNode *N) { 3554 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3555 EVT VT = SVOp->getValueType(0); 3556 3557 int NumElts = VT.getVectorNumElements(); 3558 int NumLanes = VT.getSizeInBits()/128; 3559 3560 unsigned Mask = 0; 3561 int LaneSize = NumElts/NumLanes; 3562 for (int l = 0; l < NumLanes; ++l) 3563 for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { 3564 int MaskElt = SVOp->getMaskElt(i); 3565 if (MaskElt < 0) 3566 continue; 3567 Mask |= (MaskElt-l*LaneSize) << i; 3568 } 3569 3570 return Mask; 3571} 3572 3573/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse 3574/// of what x86 movss want. X86 movs requires the lowest element to be lowest 3575/// element of vector 2 and the other elements to come from vector 1 in order. 3576static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3577 bool V2IsSplat = false, bool V2IsUndef = false) { 3578 int NumOps = VT.getVectorNumElements(); 3579 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3580 return false; 3581 3582 if (!isUndefOrEqual(Mask[0], 0)) 3583 return false; 3584 3585 for (int i = 1; i < NumOps; ++i) 3586 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3587 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3588 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3589 return false; 3590 3591 return true; 3592} 3593 3594static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3595 bool V2IsUndef = false) { 3596 SmallVector<int, 8> M; 3597 N->getMask(M); 3598 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3599} 3600 3601/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3602/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 
3603/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> 3604bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, 3605 const X86Subtarget *Subtarget) { 3606 if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) 3607 return false; 3608 3609 // The second vector must be undef 3610 if (N->getOperand(1).getOpcode() != ISD::UNDEF) 3611 return false; 3612 3613 EVT VT = N->getValueType(0); 3614 unsigned NumElems = VT.getVectorNumElements(); 3615 3616 if ((VT.getSizeInBits() == 128 && NumElems != 4) || 3617 (VT.getSizeInBits() == 256 && NumElems != 8)) 3618 return false; 3619 3620 // "i+1" is the value the indexed mask element must have 3621 for (unsigned i = 0; i < NumElems; i += 2) 3622 if (!isUndefOrEqual(N->getMaskElt(i), i+1) || 3623 !isUndefOrEqual(N->getMaskElt(i+1), i+1)) 3624 return false; 3625 3626 return true; 3627} 3628 3629/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3630/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3631/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> 3632bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, 3633 const X86Subtarget *Subtarget) { 3634 if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) 3635 return false; 3636 3637 // The second vector must be undef 3638 if (N->getOperand(1).getOpcode() != ISD::UNDEF) 3639 return false; 3640 3641 EVT VT = N->getValueType(0); 3642 unsigned NumElems = VT.getVectorNumElements(); 3643 3644 if ((VT.getSizeInBits() == 128 && NumElems != 4) || 3645 (VT.getSizeInBits() == 256 && NumElems != 8)) 3646 return false; 3647 3648 // "i" is the value the indexed mask element must have 3649 for (unsigned i = 0; i < NumElems; i += 2) 3650 if (!isUndefOrEqual(N->getMaskElt(i), i) || 3651 !isUndefOrEqual(N->getMaskElt(i+1), i)) 3652 return false; 3653 3654 return true; 3655} 3656 3657/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3658/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3659bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3660 int e = N->getValueType(0).getVectorNumElements() / 2; 3661 3662 for (int i = 0; i < e; ++i) 3663 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3664 return false; 3665 for (int i = 0; i < e; ++i) 3666 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3667 return false; 3668 return true; 3669} 3670 3671/// isVEXTRACTF128Index - Return true if the specified 3672/// EXTRACT_SUBVECTOR operand specifies a vector extract that is 3673/// suitable for input to VEXTRACTF128. 3674bool X86::isVEXTRACTF128Index(SDNode *N) { 3675 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3676 return false; 3677 3678 // The index should be aligned on a 128-bit boundary. 3679 uint64_t Index = 3680 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3681 3682 unsigned VL = N->getValueType(0).getVectorNumElements(); 3683 unsigned VBits = N->getValueType(0).getSizeInBits(); 3684 unsigned ElSize = VBits / VL; 3685 bool Result = (Index * ElSize) % 128 == 0; 3686 3687 return Result; 3688} 3689 3690/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR 3691/// operand specifies a subvector insert that is suitable for input to 3692/// VINSERTF128. 3693bool X86::isVINSERTF128Index(SDNode *N) { 3694 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3695 return false; 3696 3697 // The index should be aligned on a 128-bit boundary. 
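  // For example, inserting a v4f32 into a v8f32 is only suitable at element
  // index 0 or 4 (byte offset 0 or 16), matching the two VINSERTF128 positions.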
3698 uint64_t Index = 3699 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3700 3701 unsigned VL = N->getValueType(0).getVectorNumElements(); 3702 unsigned VBits = N->getValueType(0).getSizeInBits(); 3703 unsigned ElSize = VBits / VL; 3704 bool Result = (Index * ElSize) % 128 == 0; 3705 3706 return Result; 3707} 3708 3709/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3710/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3711unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3712 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3713 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3714 3715 unsigned Shift = (NumOperands == 4) ? 2 : 1; 3716 unsigned Mask = 0; 3717 for (int i = 0; i < NumOperands; ++i) { 3718 int Val = SVOp->getMaskElt(NumOperands-i-1); 3719 if (Val < 0) Val = 0; 3720 if (Val >= NumOperands) Val -= NumOperands; 3721 Mask |= Val; 3722 if (i != NumOperands - 1) 3723 Mask <<= Shift; 3724 } 3725 return Mask; 3726} 3727 3728/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3729/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3730unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3731 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3732 unsigned Mask = 0; 3733 // 8 nodes, but we only care about the last 4. 3734 for (unsigned i = 7; i >= 4; --i) { 3735 int Val = SVOp->getMaskElt(i); 3736 if (Val >= 0) 3737 Mask |= (Val - 4); 3738 if (i != 4) 3739 Mask <<= 2; 3740 } 3741 return Mask; 3742} 3743 3744/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3745/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3746unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3747 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3748 unsigned Mask = 0; 3749 // 8 nodes, but we only care about the first 4. 3750 for (int i = 3; i >= 0; --i) { 3751 int Val = SVOp->getMaskElt(i); 3752 if (Val >= 0) 3753 Mask |= Val; 3754 if (i != 0) 3755 Mask <<= 2; 3756 } 3757 return Mask; 3758} 3759 3760/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3761/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3762unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3763 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3764 EVT VVT = N->getValueType(0); 3765 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3766 int Val = 0; 3767 3768 unsigned i, e; 3769 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3770 Val = SVOp->getMaskElt(i); 3771 if (Val >= 0) 3772 break; 3773 } 3774 assert(Val - i > 0 && "PALIGNR imm should be positive"); 3775 return (Val - i) * EltSize; 3776} 3777 3778/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate 3779/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 3780/// instructions. 
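/// For example, extracting the upper v4i32 half of a v8i32 (element index 4,
/// 32-bit elements) produces the immediate 4 / (128/32) = 1.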
3781unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { 3782 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3783 llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); 3784 3785 uint64_t Index = 3786 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3787 3788 EVT VecVT = N->getOperand(0).getValueType(); 3789 EVT ElVT = VecVT.getVectorElementType(); 3790 3791 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 3792 return Index / NumElemsPerChunk; 3793} 3794 3795/// getInsertVINSERTF128Immediate - Return the appropriate immediate 3796/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 3797/// instructions. 3798unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { 3799 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3800 llvm_unreachable("Illegal insert subvector for VINSERTF128"); 3801 3802 uint64_t Index = 3803 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3804 3805 EVT VecVT = N->getValueType(0); 3806 EVT ElVT = VecVT.getVectorElementType(); 3807 3808 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 3809 return Index / NumElemsPerChunk; 3810} 3811 3812/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3813/// constant +0.0. 3814bool X86::isZeroNode(SDValue Elt) { 3815 return ((isa<ConstantSDNode>(Elt) && 3816 cast<ConstantSDNode>(Elt)->isNullValue()) || 3817 (isa<ConstantFPSDNode>(Elt) && 3818 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3819} 3820 3821/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3822/// their permute mask. 3823static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3824 SelectionDAG &DAG) { 3825 EVT VT = SVOp->getValueType(0); 3826 unsigned NumElems = VT.getVectorNumElements(); 3827 SmallVector<int, 8> MaskVec; 3828 3829 for (unsigned i = 0; i != NumElems; ++i) { 3830 int idx = SVOp->getMaskElt(i); 3831 if (idx < 0) 3832 MaskVec.push_back(idx); 3833 else if (idx < (int)NumElems) 3834 MaskVec.push_back(idx + NumElems); 3835 else 3836 MaskVec.push_back(idx - NumElems); 3837 } 3838 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3839 SVOp->getOperand(0), &MaskVec[0]); 3840} 3841 3842/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3843/// the two vector operands have swapped position. 3844static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 3845 unsigned NumElems = VT.getVectorNumElements(); 3846 for (unsigned i = 0; i != NumElems; ++i) { 3847 int idx = Mask[i]; 3848 if (idx < 0) 3849 continue; 3850 else if (idx < (int)NumElems) 3851 Mask[i] = idx + NumElems; 3852 else 3853 Mask[i] = idx - NumElems; 3854 } 3855} 3856 3857/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 3858/// match movhlps. The lower half elements should come from upper half of 3859/// V1 (and in order), and the upper half elements should come from the upper 3860/// half of V2 (and in order). 3861static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 3862 if (Op->getValueType(0).getVectorNumElements() != 4) 3863 return false; 3864 for (unsigned i = 0, e = 2; i != e; ++i) 3865 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 3866 return false; 3867 for (unsigned i = 2; i != 4; ++i) 3868 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 3869 return false; 3870 return true; 3871} 3872 3873/// isScalarLoadToVector - Returns true if the node is a scalar load that 3874/// is promoted to a vector. 
It also returns the LoadSDNode by reference if 3875/// required. 3876static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 3877 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 3878 return false; 3879 N = N->getOperand(0).getNode(); 3880 if (!ISD::isNON_EXTLoad(N)) 3881 return false; 3882 if (LD) 3883 *LD = cast<LoadSDNode>(N); 3884 return true; 3885} 3886 3887/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 3888/// match movlp{s|d}. The lower half elements should come from lower half of 3889/// V1 (and in order), and the upper half elements should come from the upper 3890/// half of V2 (and in order). And since V1 will become the source of the 3891/// MOVLP, it must be either a vector load or a scalar load to vector. 3892static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 3893 ShuffleVectorSDNode *Op) { 3894 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 3895 return false; 3896 // Is V2 is a vector load, don't do this transformation. We will try to use 3897 // load folding shufps op. 3898 if (ISD::isNON_EXTLoad(V2)) 3899 return false; 3900 3901 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 3902 3903 if (NumElems != 2 && NumElems != 4) 3904 return false; 3905 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3906 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 3907 return false; 3908 for (unsigned i = NumElems/2; i != NumElems; ++i) 3909 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 3910 return false; 3911 return true; 3912} 3913 3914/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 3915/// all the same. 3916static bool isSplatVector(SDNode *N) { 3917 if (N->getOpcode() != ISD::BUILD_VECTOR) 3918 return false; 3919 3920 SDValue SplatValue = N->getOperand(0); 3921 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 3922 if (N->getOperand(i) != SplatValue) 3923 return false; 3924 return true; 3925} 3926 3927/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 3928/// to an zero vector. 3929/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 3930static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3931 SDValue V1 = N->getOperand(0); 3932 SDValue V2 = N->getOperand(1); 3933 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3934 for (unsigned i = 0; i != NumElems; ++i) { 3935 int Idx = N->getMaskElt(i); 3936 if (Idx >= (int)NumElems) { 3937 unsigned Opc = V2.getOpcode(); 3938 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3939 continue; 3940 if (Opc != ISD::BUILD_VECTOR || 3941 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3942 return false; 3943 } else if (Idx >= 0) { 3944 unsigned Opc = V1.getOpcode(); 3945 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3946 continue; 3947 if (Opc != ISD::BUILD_VECTOR || 3948 !X86::isZeroNode(V1.getOperand(Idx))) 3949 return false; 3950 } 3951 } 3952 return true; 3953} 3954 3955/// getZeroVector - Returns a vector of specified type with all zero elements. 3956/// 3957static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3958 DebugLoc dl) { 3959 assert(VT.isVector() && "Expected a vector type"); 3960 3961 // Always build SSE zero vectors as <4 x i32> bitcasted 3962 // to their dest type. This ensures they get CSE'd. 
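  // For example, a v2f64 zero is emitted as (v2f64 (bitcast (v4i32 <0,0,0,0>)))
  // instead of a fresh v2f64 build_vector, so repeated zero vectors can share
  // one node.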
3963 SDValue Vec; 3964 if (VT.getSizeInBits() == 128) { // SSE 3965 if (HasSSE2) { // SSE2 3966 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3967 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3968 } else { // SSE1 3969 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3970 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3971 } 3972 } else if (VT.getSizeInBits() == 256) { // AVX 3973 // 256-bit logic and arithmetic instructions in AVX are 3974 // all floating-point, no support for integer ops. Default 3975 // to emitting fp zeroed vectors then. 3976 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3977 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 3978 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 3979 } 3980 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 3981} 3982 3983/// getOnesVector - Returns a vector of specified type with all bits set. 3984/// Always build ones vectors as <4 x i32>. For 256-bit types, use two 3985/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their 3986/// original type, ensuring they get CSE'd. 3987static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3988 assert(VT.isVector() && "Expected a vector type"); 3989 assert((VT.is128BitVector() || VT.is256BitVector()) 3990 && "Expected a 128-bit or 256-bit vector type"); 3991 3992 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3993 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, 3994 Cst, Cst, Cst, Cst); 3995 3996 if (VT.is256BitVector()) { 3997 SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), 3998 Vec, DAG.getConstant(0, MVT::i32), DAG, dl); 3999 Vec = Insert128BitVector(InsV, Vec, 4000 DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); 4001 } 4002 4003 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 4004} 4005 4006/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 4007/// that point to V2 points to its first element. 4008static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4009 EVT VT = SVOp->getValueType(0); 4010 unsigned NumElems = VT.getVectorNumElements(); 4011 4012 bool Changed = false; 4013 SmallVector<int, 8> MaskVec; 4014 SVOp->getMask(MaskVec); 4015 4016 for (unsigned i = 0; i != NumElems; ++i) { 4017 if (MaskVec[i] > (int)NumElems) { 4018 MaskVec[i] = NumElems; 4019 Changed = true; 4020 } 4021 } 4022 if (Changed) 4023 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 4024 SVOp->getOperand(1), &MaskVec[0]); 4025 return SDValue(SVOp, 0); 4026} 4027 4028/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 4029/// operation of specified width. 4030static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4031 SDValue V2) { 4032 unsigned NumElems = VT.getVectorNumElements(); 4033 SmallVector<int, 8> Mask; 4034 Mask.push_back(NumElems); 4035 for (unsigned i = 1; i != NumElems; ++i) 4036 Mask.push_back(i); 4037 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4038} 4039 4040/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 
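/// For example, for v4i32 the unpackl mask is <0, 4, 1, 5>, interleaving the
/// low halves of V1 and V2.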
4041static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4042 SDValue V2) { 4043 unsigned NumElems = VT.getVectorNumElements(); 4044 SmallVector<int, 8> Mask; 4045 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 4046 Mask.push_back(i); 4047 Mask.push_back(i + NumElems); 4048 } 4049 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4050} 4051 4052/// getUnpackh - Returns a vector_shuffle node for an unpackh operation. 4053static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4054 SDValue V2) { 4055 unsigned NumElems = VT.getVectorNumElements(); 4056 unsigned Half = NumElems/2; 4057 SmallVector<int, 8> Mask; 4058 for (unsigned i = 0; i != Half; ++i) { 4059 Mask.push_back(i + Half); 4060 Mask.push_back(i + NumElems + Half); 4061 } 4062 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4063} 4064 4065// PromoteSplatv8v16 - All i16 and i8 vector types can't be used directly by 4066// a generic shuffle instruction because the target has no such instructions. 4067// Generate shuffles which repeat i16 and i8 several times until they can be 4068// represented by v4f32 and then be manipulated by target suported shuffles. 4069static SDValue PromoteSplatv8v16(SDValue V, SelectionDAG &DAG, int &EltNo) { 4070 EVT VT = V.getValueType(); 4071 int NumElems = VT.getVectorNumElements(); 4072 DebugLoc dl = V.getDebugLoc(); 4073 4074 while (NumElems > 4) { 4075 if (EltNo < NumElems/2) { 4076 V = getUnpackl(DAG, dl, VT, V, V); 4077 } else { 4078 V = getUnpackh(DAG, dl, VT, V, V); 4079 EltNo -= NumElems/2; 4080 } 4081 NumElems >>= 1; 4082 } 4083 return V; 4084} 4085 4086/// getLegalSplat - Generate a legal splat with supported x86 shuffles 4087static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { 4088 EVT VT = V.getValueType(); 4089 DebugLoc dl = V.getDebugLoc(); 4090 assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) 4091 && "Vector size not supported"); 4092 4093 bool Is128 = VT.getSizeInBits() == 128; 4094 EVT NVT = Is128 ? MVT::v4f32 : MVT::v8f32; 4095 V = DAG.getNode(ISD::BITCAST, dl, NVT, V); 4096 4097 if (Is128) { 4098 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 4099 V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]); 4100 } else { 4101 // The second half of indicies refer to the higher part, which is a 4102 // duplication of the lower one. This makes this shuffle a perfect match 4103 // for the VPERM instruction. 4104 int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo, 4105 EltNo+4, EltNo+4, EltNo+4, EltNo+4 }; 4106 V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]); 4107 } 4108 4109 return DAG.getNode(ISD::BITCAST, dl, VT, V); 4110} 4111 4112/// PromoteVectorToScalarSplat - Since there's no native support for 4113/// scalar_to_vector for 256-bit AVX, a 128-bit scalar_to_vector + 4114/// INSERT_SUBVECTOR is generated. Recognize this idiom and do the 4115/// shuffle before the insertion, this yields less instructions in the end. 
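/// For example, a 256-bit splat of a freshly inserted 128-bit subvector is done
/// as a 128-bit shuffle of that subvector followed by two 128-bit insertions,
/// rather than a shuffle of the whole 256-bit value.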
4116static SDValue PromoteVectorToScalarSplat(ShuffleVectorSDNode *SV, 4117 SelectionDAG &DAG) { 4118 EVT SrcVT = SV->getValueType(0); 4119 SDValue V1 = SV->getOperand(0); 4120 DebugLoc dl = SV->getDebugLoc(); 4121 int NumElems = SrcVT.getVectorNumElements(); 4122 4123 assert(SrcVT.is256BitVector() && "unknown howto handle vector type"); 4124 4125 SmallVector<int, 4> Mask; 4126 for (int i = 0; i < NumElems/2; ++i) 4127 Mask.push_back(SV->getMaskElt(i)); 4128 4129 EVT SVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), 4130 NumElems/2); 4131 SDValue SV1 = DAG.getVectorShuffle(SVT, dl, V1.getOperand(1), 4132 DAG.getUNDEF(SVT), &Mask[0]); 4133 SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), SV1, 4134 DAG.getConstant(0, MVT::i32), DAG, dl); 4135 4136 return Insert128BitVector(InsV, SV1, 4137 DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); 4138} 4139 4140/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32 and 4141/// v8i32, v16i16 or v32i8 to v8f32. 4142static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 4143 EVT SrcVT = SV->getValueType(0); 4144 SDValue V1 = SV->getOperand(0); 4145 DebugLoc dl = SV->getDebugLoc(); 4146 4147 int EltNo = SV->getSplatIndex(); 4148 int NumElems = SrcVT.getVectorNumElements(); 4149 unsigned Size = SrcVT.getSizeInBits(); 4150 4151 // Extract the 128-bit part containing the splat element and update 4152 // the splat element index when it refers to the higher register. 4153 if (Size == 256) { 4154 unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0; 4155 V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl); 4156 if (Idx > 0) 4157 EltNo -= NumElems/2; 4158 } 4159 4160 // Make this 128-bit vector duplicate i8 and i16 elements 4161 if (NumElems > 4) 4162 V1 = PromoteSplatv8v16(V1, DAG, EltNo); 4163 4164 // Recreate the 256-bit vector and place the same 128-bit vector 4165 // into the low and high part. This is necessary because we want 4166 // to use VPERM to shuffle the v8f32 vector, and VPERM only shuffles 4167 // inside each separate v4f32 lane. 4168 if (Size == 256) { 4169 SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1, 4170 DAG.getConstant(0, MVT::i32), DAG, dl); 4171 V1 = Insert128BitVector(InsV, V1, 4172 DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); 4173 } 4174 4175 return getLegalSplat(DAG, V1, EltNo); 4176} 4177 4178/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 4179/// vector of zero or undef vector. This produces a shuffle where the low 4180/// element of V2 is swizzled into the zero/undef vector, landing at element 4181/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 4182static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 4183 bool isZero, bool HasSSE2, 4184 SelectionDAG &DAG) { 4185 EVT VT = V2.getValueType(); 4186 SDValue V1 = isZero 4187 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 4188 unsigned NumElems = VT.getVectorNumElements(); 4189 SmallVector<int, 16> MaskVec; 4190 for (unsigned i = 0; i != NumElems; ++i) 4191 // If this is the insertion idx, put the low elt of V2 here. 4192 MaskVec.push_back(i == Idx ? NumElems : i); 4193 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 4194} 4195 4196/// getShuffleScalarElt - Returns the scalar element that will make up the ith 4197/// element of the result of the vector shuffle. 
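/// For example, asking for element 2 of (shuffle (build_vector a, b, c, d), undef,
/// <3, 2, 1, 0>) follows mask entry 1 back into the build_vector and returns b.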
4198static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, 4199 unsigned Depth) { 4200 if (Depth == 6) 4201 return SDValue(); // Limit search depth. 4202 4203 SDValue V = SDValue(N, 0); 4204 EVT VT = V.getValueType(); 4205 unsigned Opcode = V.getOpcode(); 4206 4207 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 4208 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 4209 Index = SV->getMaskElt(Index); 4210 4211 if (Index < 0) 4212 return DAG.getUNDEF(VT.getVectorElementType()); 4213 4214 int NumElems = VT.getVectorNumElements(); 4215 SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); 4216 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); 4217 } 4218 4219 // Recurse into target specific vector shuffles to find scalars. 4220 if (isTargetShuffle(Opcode)) { 4221 int NumElems = VT.getVectorNumElements(); 4222 SmallVector<unsigned, 16> ShuffleMask; 4223 SDValue ImmN; 4224 4225 switch(Opcode) { 4226 case X86ISD::SHUFPS: 4227 case X86ISD::SHUFPD: 4228 ImmN = N->getOperand(N->getNumOperands()-1); 4229 DecodeSHUFPSMask(NumElems, 4230 cast<ConstantSDNode>(ImmN)->getZExtValue(), 4231 ShuffleMask); 4232 break; 4233 case X86ISD::PUNPCKHBW: 4234 case X86ISD::PUNPCKHWD: 4235 case X86ISD::PUNPCKHDQ: 4236 case X86ISD::PUNPCKHQDQ: 4237 DecodePUNPCKHMask(NumElems, ShuffleMask); 4238 break; 4239 case X86ISD::UNPCKHPS: 4240 case X86ISD::UNPCKHPD: 4241 case X86ISD::VUNPCKHPSY: 4242 case X86ISD::VUNPCKHPDY: 4243 DecodeUNPCKHPMask(NumElems, ShuffleMask); 4244 break; 4245 case X86ISD::PUNPCKLBW: 4246 case X86ISD::PUNPCKLWD: 4247 case X86ISD::PUNPCKLDQ: 4248 case X86ISD::PUNPCKLQDQ: 4249 DecodePUNPCKLMask(VT, ShuffleMask); 4250 break; 4251 case X86ISD::UNPCKLPS: 4252 case X86ISD::UNPCKLPD: 4253 case X86ISD::VUNPCKLPSY: 4254 case X86ISD::VUNPCKLPDY: 4255 DecodeUNPCKLPMask(VT, ShuffleMask); 4256 break; 4257 case X86ISD::MOVHLPS: 4258 DecodeMOVHLPSMask(NumElems, ShuffleMask); 4259 break; 4260 case X86ISD::MOVLHPS: 4261 DecodeMOVLHPSMask(NumElems, ShuffleMask); 4262 break; 4263 case X86ISD::PSHUFD: 4264 ImmN = N->getOperand(N->getNumOperands()-1); 4265 DecodePSHUFMask(NumElems, 4266 cast<ConstantSDNode>(ImmN)->getZExtValue(), 4267 ShuffleMask); 4268 break; 4269 case X86ISD::PSHUFHW: 4270 ImmN = N->getOperand(N->getNumOperands()-1); 4271 DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 4272 ShuffleMask); 4273 break; 4274 case X86ISD::PSHUFLW: 4275 ImmN = N->getOperand(N->getNumOperands()-1); 4276 DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 4277 ShuffleMask); 4278 break; 4279 case X86ISD::MOVSS: 4280 case X86ISD::MOVSD: { 4281 // The index 0 always comes from the first element of the second source, 4282 // this is why MOVSS and MOVSD are used in the first place. The other 4283 // elements come from the other positions of the first source vector. 4284 unsigned OpNum = (Index == 0) ? 
1 : 0; 4285 return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG, 4286 Depth+1); 4287 } 4288 case X86ISD::VPERMILPS: 4289 ImmN = N->getOperand(N->getNumOperands()-1); 4290 DecodeVPERMILPSMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), 4291 ShuffleMask); 4292 break; 4293 case X86ISD::VPERMILPSY: 4294 ImmN = N->getOperand(N->getNumOperands()-1); 4295 DecodeVPERMILPSMask(8, cast<ConstantSDNode>(ImmN)->getZExtValue(), 4296 ShuffleMask); 4297 break; 4298 case X86ISD::VPERMILPD: 4299 ImmN = N->getOperand(N->getNumOperands()-1); 4300 DecodeVPERMILPDMask(2, cast<ConstantSDNode>(ImmN)->getZExtValue(), 4301 ShuffleMask); 4302 break; 4303 case X86ISD::VPERMILPDY: 4304 ImmN = N->getOperand(N->getNumOperands()-1); 4305 DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), 4306 ShuffleMask); 4307 break; 4308 default: 4309 assert(0 && "not implemented for target shuffle node"); 4310 return SDValue(); 4311 } 4312 4313 Index = ShuffleMask[Index]; 4314 if (Index < 0) 4315 return DAG.getUNDEF(VT.getVectorElementType()); 4316 4317 SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1); 4318 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, 4319 Depth+1); 4320 } 4321 4322 // Actual nodes that may contain scalar elements 4323 if (Opcode == ISD::BITCAST) { 4324 V = V.getOperand(0); 4325 EVT SrcVT = V.getValueType(); 4326 unsigned NumElems = VT.getVectorNumElements(); 4327 4328 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) 4329 return SDValue(); 4330 } 4331 4332 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) 4333 return (Index == 0) ? V.getOperand(0) 4334 : DAG.getUNDEF(VT.getVectorElementType()); 4335 4336 if (V.getOpcode() == ISD::BUILD_VECTOR) 4337 return V.getOperand(Index); 4338 4339 return SDValue(); 4340 } 4341 4342 /// getNumOfConsecutiveZeros - Return the number of elements of a vector 4343 /// shuffle operation which are consecutive zeros. The 4344 /// search can start in two different directions, from left or right. 4345 static 4346 unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems, 4347 bool ZerosFromLeft, SelectionDAG &DAG) { 4348 int i = 0; 4349 4350 while (i < NumElems) { 4351 unsigned Index = ZerosFromLeft ? i : NumElems-i-1; 4352 SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0); 4353 if (!(Elt.getNode() && 4354 (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)))) 4355 break; 4356 ++i; 4357 } 4358 4359 return i; 4360 } 4361 4362 /// isShuffleMaskConsecutive - Check if the shuffle mask indices from MaskI to 4363 /// MaskE correspond consecutively to elements from one of the vector operands, 4364 /// starting from its index OpIdx. Also report via OpNum which source vector operand is used. 4365 static 4366 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE, 4367 int OpIdx, int NumElems, unsigned &OpNum) { 4368 bool SeenV1 = false; 4369 bool SeenV2 = false; 4370 4371 for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) { 4372 int Idx = SVOp->getMaskElt(i); 4373 // Ignore undef indices 4374 if (Idx < 0) 4375 continue; 4376 4377 if (Idx < NumElems) 4378 SeenV1 = true; 4379 else 4380 SeenV2 = true; 4381 4382 // Only accept consecutive elements from the same vector 4383 if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2)) 4384 return false; 4385 } 4386 4387 OpNum = SeenV1 ? 0 : 1; 4388 return true; 4389 } 4390 4391 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a 4392 /// logical right shift of a vector.
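/// For example, the v4i32 shuffle <1, 2, 3, 4> where V2 is all zeros can be done
/// as a logical right shift of V1 by one element.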
4393static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4394 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4395 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 4396 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 4397 false /* check zeros from right */, DAG); 4398 unsigned OpSrc; 4399 4400 if (!NumZeros) 4401 return false; 4402 4403 // Considering the elements in the mask that are not consecutive zeros, 4404 // check if they consecutively come from only one of the source vectors. 4405 // 4406 // V1 = {X, A, B, C} 0 4407 // \ \ \ / 4408 // vector_shuffle V1, V2 <1, 2, 3, X> 4409 // 4410 if (!isShuffleMaskConsecutive(SVOp, 4411 0, // Mask Start Index 4412 NumElems-NumZeros-1, // Mask End Index 4413 NumZeros, // Where to start looking in the src vector 4414 NumElems, // Number of elements in vector 4415 OpSrc)) // Which source operand ? 4416 return false; 4417 4418 isLeft = false; 4419 ShAmt = NumZeros; 4420 ShVal = SVOp->getOperand(OpSrc); 4421 return true; 4422} 4423 4424/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a 4425/// logical left shift of a vector. 4426static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4427 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4428 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 4429 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 4430 true /* check zeros from left */, DAG); 4431 unsigned OpSrc; 4432 4433 if (!NumZeros) 4434 return false; 4435 4436 // Considering the elements in the mask that are not consecutive zeros, 4437 // check if they consecutively come from only one of the source vectors. 4438 // 4439 // 0 { A, B, X, X } = V2 4440 // / \ / / 4441 // vector_shuffle V1, V2 <X, X, 4, 5> 4442 // 4443 if (!isShuffleMaskConsecutive(SVOp, 4444 NumZeros, // Mask Start Index 4445 NumElems-1, // Mask End Index 4446 0, // Where to start looking in the src vector 4447 NumElems, // Number of elements in vector 4448 OpSrc)) // Which source operand ? 4449 return false; 4450 4451 isLeft = true; 4452 ShAmt = NumZeros; 4453 ShVal = SVOp->getOperand(OpSrc); 4454 return true; 4455} 4456 4457/// isVectorShift - Returns true if the shuffle can be implemented as a 4458/// logical left or right shift of a vector. 4459static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4460 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4461 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 4462 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 4463 return true; 4464 4465 return false; 4466} 4467 4468/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
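/// Adjacent pairs of i8 elements are zero-extended to i16, merged with a shift
/// and OR, inserted into a v8i16, and the result is bitcast back to v16i8.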
4469/// 4470static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 4471 unsigned NumNonZero, unsigned NumZero, 4472 SelectionDAG &DAG, 4473 const TargetLowering &TLI) { 4474 if (NumNonZero > 8) 4475 return SDValue(); 4476 4477 DebugLoc dl = Op.getDebugLoc(); 4478 SDValue V(0, 0); 4479 bool First = true; 4480 for (unsigned i = 0; i < 16; ++i) { 4481 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 4482 if (ThisIsNonZero && First) { 4483 if (NumZero) 4484 V = getZeroVector(MVT::v8i16, true, DAG, dl); 4485 else 4486 V = DAG.getUNDEF(MVT::v8i16); 4487 First = false; 4488 } 4489 4490 if ((i & 1) != 0) { 4491 SDValue ThisElt(0, 0), LastElt(0, 0); 4492 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 4493 if (LastIsNonZero) { 4494 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 4495 MVT::i16, Op.getOperand(i-1)); 4496 } 4497 if (ThisIsNonZero) { 4498 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 4499 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 4500 ThisElt, DAG.getConstant(8, MVT::i8)); 4501 if (LastIsNonZero) 4502 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 4503 } else 4504 ThisElt = LastElt; 4505 4506 if (ThisElt.getNode()) 4507 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 4508 DAG.getIntPtrConstant(i/2)); 4509 } 4510 } 4511 4512 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 4513} 4514 4515/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 4516/// 4517static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 4518 unsigned NumNonZero, unsigned NumZero, 4519 SelectionDAG &DAG, 4520 const TargetLowering &TLI) { 4521 if (NumNonZero > 4) 4522 return SDValue(); 4523 4524 DebugLoc dl = Op.getDebugLoc(); 4525 SDValue V(0, 0); 4526 bool First = true; 4527 for (unsigned i = 0; i < 8; ++i) { 4528 bool isNonZero = (NonZeros & (1 << i)) != 0; 4529 if (isNonZero) { 4530 if (First) { 4531 if (NumZero) 4532 V = getZeroVector(MVT::v8i16, true, DAG, dl); 4533 else 4534 V = DAG.getUNDEF(MVT::v8i16); 4535 First = false; 4536 } 4537 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 4538 MVT::v8i16, V, Op.getOperand(i), 4539 DAG.getIntPtrConstant(i)); 4540 } 4541 } 4542 4543 return V; 4544} 4545 4546/// getVShift - Return a vector logical shift node. 4547/// 4548static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 4549 unsigned NumBits, SelectionDAG &DAG, 4550 const TargetLowering &TLI, DebugLoc dl) { 4551 EVT ShVT = MVT::v2i64; 4552 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 4553 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 4554 return DAG.getNode(ISD::BITCAST, dl, VT, 4555 DAG.getNode(Opc, dl, ShVT, SrcOp, 4556 DAG.getConstant(NumBits, 4557 TLI.getShiftAmountTy(SrcOp.getValueType())))); 4558} 4559 4560SDValue 4561X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 4562 SelectionDAG &DAG) const { 4563 4564 // Check if the scalar load can be widened into a vector load. And if 4565 // the address is "base + cst" see if the cst can be "absorbed" into 4566 // the shuffle mask. 
4567 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 4568 SDValue Ptr = LD->getBasePtr(); 4569 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 4570 return SDValue(); 4571 EVT PVT = LD->getValueType(0); 4572 if (PVT != MVT::i32 && PVT != MVT::f32) 4573 return SDValue(); 4574 4575 int FI = -1; 4576 int64_t Offset = 0; 4577 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 4578 FI = FINode->getIndex(); 4579 Offset = 0; 4580 } else if (DAG.isBaseWithConstantOffset(Ptr) && 4581 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 4582 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 4583 Offset = Ptr.getConstantOperandVal(1); 4584 Ptr = Ptr.getOperand(0); 4585 } else { 4586 return SDValue(); 4587 } 4588 4589 // FIXME: 256-bit vector instructions don't require a strict alignment, 4590 // improve this code to support it better. 4591 unsigned RequiredAlign = VT.getSizeInBits()/8; 4592 SDValue Chain = LD->getChain(); 4593 // Make sure the stack object alignment is at least 16 or 32. 4594 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4595 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { 4596 if (MFI->isFixedObjectIndex(FI)) { 4597 // Can't change the alignment. FIXME: It's possible to compute 4598 // the exact stack offset and reference FI + adjust offset instead. 4599 // If someone *really* cares about this. That's the way to implement it. 4600 return SDValue(); 4601 } else { 4602 MFI->setObjectAlignment(FI, RequiredAlign); 4603 } 4604 } 4605 4606 // (Offset % 16 or 32) must be multiple of 4. Then address is then 4607 // Ptr + (Offset & ~15). 4608 if (Offset < 0) 4609 return SDValue(); 4610 if ((Offset % RequiredAlign) & 3) 4611 return SDValue(); 4612 int64_t StartOffset = Offset & ~(RequiredAlign-1); 4613 if (StartOffset) 4614 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 4615 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 4616 4617 int EltNo = (Offset - StartOffset) >> 2; 4618 int NumElems = VT.getVectorNumElements(); 4619 4620 EVT CanonVT = VT.getSizeInBits() == 128 ? MVT::v4i32 : MVT::v8i32; 4621 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); 4622 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, 4623 LD->getPointerInfo().getWithOffset(StartOffset), 4624 false, false, 0); 4625 4626 // Canonicalize it to a v4i32 or v8i32 shuffle. 4627 SmallVector<int, 8> Mask; 4628 for (int i = 0; i < NumElems; ++i) 4629 Mask.push_back(EltNo); 4630 4631 V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1); 4632 return DAG.getNode(ISD::BITCAST, dl, NVT, 4633 DAG.getVectorShuffle(CanonVT, dl, V1, 4634 DAG.getUNDEF(CanonVT),&Mask[0])); 4635 } 4636 4637 return SDValue(); 4638} 4639 4640/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 4641/// vector of type 'VT', see if the elements can be replaced by a single large 4642/// load which has the same value as a build_vector whose operands are 'elts'. 4643/// 4644/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 4645/// 4646/// FIXME: we'd also like to handle the case where the last elements are zero 4647/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 4648/// There's even a handy isZeroNode for that purpose. 
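/// Another illustrative example, for the full-width case:
///    <load i32 *a, load i32 *a+4, load i32 *a+8, load i32 *a+12> -> load <4 x i32>* a
/// provided the four loads are consecutive.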
4649static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 4650 DebugLoc &DL, SelectionDAG &DAG) { 4651 EVT EltVT = VT.getVectorElementType(); 4652 unsigned NumElems = Elts.size(); 4653 4654 LoadSDNode *LDBase = NULL; 4655 unsigned LastLoadedElt = -1U; 4656 4657 // For each element in the initializer, see if we've found a load or an undef. 4658 // If we don't find an initial load element, or later load elements are 4659 // non-consecutive, bail out. 4660 for (unsigned i = 0; i < NumElems; ++i) { 4661 SDValue Elt = Elts[i]; 4662 4663 if (!Elt.getNode() || 4664 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 4665 return SDValue(); 4666 if (!LDBase) { 4667 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 4668 return SDValue(); 4669 LDBase = cast<LoadSDNode>(Elt.getNode()); 4670 LastLoadedElt = i; 4671 continue; 4672 } 4673 if (Elt.getOpcode() == ISD::UNDEF) 4674 continue; 4675 4676 LoadSDNode *LD = cast<LoadSDNode>(Elt); 4677 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 4678 return SDValue(); 4679 LastLoadedElt = i; 4680 } 4681 4682 // If we have found an entire vector of loads and undefs, then return a large 4683 // load of the entire vector width starting at the base pointer. If we found 4684 // consecutive loads for the low half, generate a vzext_load node. 4685 if (LastLoadedElt == NumElems - 1) { 4686 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 4687 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4688 LDBase->getPointerInfo(), 4689 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 4690 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4691 LDBase->getPointerInfo(), 4692 LDBase->isVolatile(), LDBase->isNonTemporal(), 4693 LDBase->getAlignment()); 4694 } else if (NumElems == 4 && LastLoadedElt == 1 && 4695 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { 4696 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 4697 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 4698 SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, 4699 Ops, 2, MVT::i32, 4700 LDBase->getMemOperand()); 4701 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 4702 } 4703 return SDValue(); 4704} 4705 4706SDValue 4707X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 4708 DebugLoc dl = Op.getDebugLoc(); 4709 4710 EVT VT = Op.getValueType(); 4711 EVT ExtVT = VT.getVectorElementType(); 4712 unsigned NumElems = Op.getNumOperands(); 4713 4714 // Vectors containing all zeros can be matched by pxor and xorps later 4715 if (ISD::isBuildVectorAllZeros(Op.getNode())) { 4716 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd 4717 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. 4718 if (Op.getValueType() == MVT::v4i32 || 4719 Op.getValueType() == MVT::v8i32) 4720 return Op; 4721 4722 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 4723 } 4724 4725 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width 4726 // vectors or broken into v4i32 operations on 256-bit vectors. 
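  // For example, an all-ones v4i32 build_vector is returned unchanged and later
  // matched as pcmpeqd of a register with itself; wider all-ones vectors are
  // handed to getOnesVector, which (per the note above) builds them out of
  // v4i32 pieces.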
4727 if (ISD::isBuildVectorAllOnes(Op.getNode())) { 4728 if (Op.getValueType() == MVT::v4i32) 4729 return Op; 4730 4731 return getOnesVector(Op.getValueType(), DAG, dl); 4732 } 4733 4734 unsigned EVTBits = ExtVT.getSizeInBits(); 4735 4736 unsigned NumZero = 0; 4737 unsigned NumNonZero = 0; 4738 unsigned NonZeros = 0; 4739 bool IsAllConstants = true; 4740 SmallSet<SDValue, 8> Values; 4741 for (unsigned i = 0; i < NumElems; ++i) { 4742 SDValue Elt = Op.getOperand(i); 4743 if (Elt.getOpcode() == ISD::UNDEF) 4744 continue; 4745 Values.insert(Elt); 4746 if (Elt.getOpcode() != ISD::Constant && 4747 Elt.getOpcode() != ISD::ConstantFP) 4748 IsAllConstants = false; 4749 if (X86::isZeroNode(Elt)) 4750 NumZero++; 4751 else { 4752 NonZeros |= (1 << i); 4753 NumNonZero++; 4754 } 4755 } 4756 4757 // All undef vector. Return an UNDEF. All zero vectors were handled above. 4758 if (NumNonZero == 0) 4759 return DAG.getUNDEF(VT); 4760 4761 // Special case for single non-zero, non-undef, element. 4762 if (NumNonZero == 1) { 4763 unsigned Idx = CountTrailingZeros_32(NonZeros); 4764 SDValue Item = Op.getOperand(Idx); 4765 4766 // If this is an insertion of an i64 value on x86-32, and if the top bits of 4767 // the value are obviously zero, truncate the value to i32 and do the 4768 // insertion that way. Only do this if the value is non-constant or if the 4769 // value is a constant being inserted into element 0. It is cheaper to do 4770 // a constant pool load than it is to do a movd + shuffle. 4771 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 4772 (!IsAllConstants || Idx == 0)) { 4773 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 4774 // Handle SSE only. 4775 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 4776 EVT VecVT = MVT::v4i32; 4777 unsigned VecElts = 4; 4778 4779 // Truncate the value (which may itself be a constant) to i32, and 4780 // convert it to a vector with movd (S2V+shuffle to zero extend). 4781 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 4782 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 4783 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4784 Subtarget->hasSSE2(), DAG); 4785 4786 // Now we have our 32-bit value zero extended in the low element of 4787 // a vector. If Idx != 0, swizzle it into place. 4788 if (Idx != 0) { 4789 SmallVector<int, 4> Mask; 4790 Mask.push_back(Idx); 4791 for (unsigned i = 1; i != VecElts; ++i) 4792 Mask.push_back(i); 4793 Item = DAG.getVectorShuffle(VecVT, dl, Item, 4794 DAG.getUNDEF(Item.getValueType()), 4795 &Mask[0]); 4796 } 4797 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); 4798 } 4799 } 4800 4801 // If we have a constant or non-constant insertion into the low element of 4802 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 4803 // the rest of the elements. This will be matched as movd/movq/movss/movsd 4804 // depending on what the source datatype is. 4805 if (Idx == 0) { 4806 if (NumZero == 0) { 4807 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4808 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 4809 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 4810 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4811 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 
4812 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 4813 DAG); 4814 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 4815 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 4816 assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); 4817 EVT MiddleVT = MVT::v4i32; 4818 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 4819 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4820 Subtarget->hasSSE2(), DAG); 4821 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 4822 } 4823 } 4824 4825 // Is it a vector logical left shift? 4826 if (NumElems == 2 && Idx == 1 && 4827 X86::isZeroNode(Op.getOperand(0)) && 4828 !X86::isZeroNode(Op.getOperand(1))) { 4829 unsigned NumBits = VT.getSizeInBits(); 4830 return getVShift(true, VT, 4831 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4832 VT, Op.getOperand(1)), 4833 NumBits/2, DAG, *this, dl); 4834 } 4835 4836 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 4837 return SDValue(); 4838 4839 // Otherwise, if this is a vector with i32 or f32 elements, and the element 4840 // is a non-constant being inserted into an element other than the low one, 4841 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 4842 // movd/movss) to move this into the low element, then shuffle it into 4843 // place. 4844 if (EVTBits == 32) { 4845 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4846 4847 // Turn it into a shuffle of zero and zero-extended scalar to vector. 4848 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 4849 Subtarget->hasSSE2(), DAG); 4850 SmallVector<int, 8> MaskVec; 4851 for (unsigned i = 0; i < NumElems; i++) 4852 MaskVec.push_back(i == Idx ? 0 : 1); 4853 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 4854 } 4855 } 4856 4857 // Splat is obviously ok. Let legalizer expand it to a shuffle. 4858 if (Values.size() == 1) { 4859 if (EVTBits == 32) { 4860 // Instead of a shuffle like this: 4861 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 4862 // Check if it's possible to issue this instead. 4863 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 4864 unsigned Idx = CountTrailingZeros_32(NonZeros); 4865 SDValue Item = Op.getOperand(Idx); 4866 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 4867 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 4868 } 4869 return SDValue(); 4870 } 4871 4872 // A vector full of immediates; various special cases are already 4873 // handled, so this is best done with a single constant-pool load. 4874 if (IsAllConstants) 4875 return SDValue(); 4876 4877 // For AVX-length vectors, build the individual 128-bit pieces and use 4878 // shuffles to put them in place. 4879 if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) { 4880 SmallVector<SDValue, 32> V; 4881 for (unsigned i = 0; i < NumElems; ++i) 4882 V.push_back(Op.getOperand(i)); 4883 4884 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); 4885 4886 // Build both the lower and upper subvector. 4887 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); 4888 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], 4889 NumElems/2); 4890 4891 // Recreate the wider vector with the lower and upper part. 
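    // Illustrative example: for a v8i32 build_vector, Lower and Upper are v4i32;
    // the first Insert128BitVector below puts Lower into the low half of an
    // undef v8i32 (index 0) and the second puts Upper into the high half
    // (index NumElems/2), matching vinsertf128.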
4892 SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower, 4893 DAG.getConstant(0, MVT::i32), DAG, dl); 4894 return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32), 4895 DAG, dl); 4896 } 4897 4898 // Let legalizer expand 2-wide build_vectors. 4899 if (EVTBits == 64) { 4900 if (NumNonZero == 1) { 4901 // One half is zero or undef. 4902 unsigned Idx = CountTrailingZeros_32(NonZeros); 4903 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 4904 Op.getOperand(Idx)); 4905 return getShuffleVectorZeroOrUndef(V2, Idx, true, 4906 Subtarget->hasSSE2(), DAG); 4907 } 4908 return SDValue(); 4909 } 4910 4911 // If element VT is < 32 bits, convert it to inserts into a zero vector. 4912 if (EVTBits == 8 && NumElems == 16) { 4913 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 4914 *this); 4915 if (V.getNode()) return V; 4916 } 4917 4918 if (EVTBits == 16 && NumElems == 8) { 4919 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 4920 *this); 4921 if (V.getNode()) return V; 4922 } 4923 4924 // If element VT is == 32 bits, turn it into a number of shuffles. 4925 SmallVector<SDValue, 8> V; 4926 V.resize(NumElems); 4927 if (NumElems == 4 && NumZero > 0) { 4928 for (unsigned i = 0; i < 4; ++i) { 4929 bool isZero = !(NonZeros & (1 << i)); 4930 if (isZero) 4931 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4932 else 4933 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4934 } 4935 4936 for (unsigned i = 0; i < 2; ++i) { 4937 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 4938 default: break; 4939 case 0: 4940 V[i] = V[i*2]; // Must be a zero vector. 4941 break; 4942 case 1: 4943 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 4944 break; 4945 case 2: 4946 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 4947 break; 4948 case 3: 4949 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 4950 break; 4951 } 4952 } 4953 4954 SmallVector<int, 8> MaskVec; 4955 bool Reverse = (NonZeros & 0x3) == 2; 4956 for (unsigned i = 0; i < 2; ++i) 4957 MaskVec.push_back(Reverse ? 1-i : i); 4958 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 4959 for (unsigned i = 0; i < 2; ++i) 4960 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 4961 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 4962 } 4963 4964 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 4965 // Check for a build vector of consecutive loads. 4966 for (unsigned i = 0; i < NumElems; ++i) 4967 V[i] = Op.getOperand(i); 4968 4969 // Check for elements which are consecutive loads. 4970 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 4971 if (LD.getNode()) 4972 return LD; 4973 4974 // For SSE 4.1, use insertps to put the high elements into the low element. 4975 if (getSubtarget()->hasSSE41()) { 4976 SDValue Result; 4977 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 4978 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 4979 else 4980 Result = DAG.getUNDEF(VT); 4981 4982 for (unsigned i = 1; i < NumElems; ++i) { 4983 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 4984 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 4985 Op.getOperand(i), DAG.getIntPtrConstant(i)); 4986 } 4987 return Result; 4988 } 4989 4990 // Otherwise, expand into a number of unpckl*, start by extending each of 4991 // our (non-undef) elements to the full vector width with the element in the 4992 // bottom slot of the vector (which generates no code for SSE). 
4993 for (unsigned i = 0; i < NumElems; ++i) { 4994 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 4995 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4996 else 4997 V[i] = DAG.getUNDEF(VT); 4998 } 4999 5000 // Next, we iteratively mix elements, e.g. for v4f32: 5001 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 5002 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 5003 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 5004 unsigned EltStride = NumElems >> 1; 5005 while (EltStride != 0) { 5006 for (unsigned i = 0; i < EltStride; ++i) { 5007 // If V[i+EltStride] is undef and this is the first round of mixing, 5008 // then it is safe to just drop this shuffle: V[i] is already in the 5009 // right place, the one element (since it's the first round) being 5010 // inserted as undef can be dropped. This isn't safe for successive 5011 // rounds because they will permute elements within both vectors. 5012 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 5013 EltStride == NumElems/2) 5014 continue; 5015 5016 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 5017 } 5018 EltStride >>= 1; 5019 } 5020 return V[0]; 5021 } 5022 return SDValue(); 5023} 5024 5025// LowerMMXCONCAT_VECTORS - We support concatenate two MMX registers and place 5026// them in a MMX register. This is better than doing a stack convert. 5027static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 5028 DebugLoc dl = Op.getDebugLoc(); 5029 EVT ResVT = Op.getValueType(); 5030 5031 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 5032 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 5033 int Mask[2]; 5034 SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0)); 5035 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 5036 InVec = Op.getOperand(1); 5037 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 5038 unsigned NumElts = ResVT.getVectorNumElements(); 5039 VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 5040 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 5041 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 5042 } else { 5043 InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec); 5044 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 5045 Mask[0] = 0; Mask[1] = 2; 5046 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 5047 } 5048 return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 5049} 5050 5051// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction 5052// to create 256-bit vectors from two other 128-bit ones. 5053static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 5054 DebugLoc dl = Op.getDebugLoc(); 5055 EVT ResVT = Op.getValueType(); 5056 5057 assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide"); 5058 5059 SDValue V1 = Op.getOperand(0); 5060 SDValue V2 = Op.getOperand(1); 5061 unsigned NumElems = ResVT.getVectorNumElements(); 5062 5063 SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1, 5064 DAG.getConstant(0, MVT::i32), DAG, dl); 5065 return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), 5066 DAG, dl); 5067} 5068 5069SDValue 5070X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 5071 EVT ResVT = Op.getValueType(); 5072 5073 assert(Op.getNumOperands() == 2); 5074 assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) && 5075 "Unsupported CONCAT_VECTORS for value type"); 5076 5077 // We support concatenate two MMX registers and place them in a MMX register. 
5078 // This is better than doing a stack convert. 5079 if (ResVT.is128BitVector()) 5080 return LowerMMXCONCAT_VECTORS(Op, DAG); 5081 5082 // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors 5083 // from two other 128-bit ones. 5084 return LowerAVXCONCAT_VECTORS(Op, DAG); 5085} 5086 5087// v8i16 shuffles - Prefer shuffles in the following order: 5088// 1. [all] pshuflw, pshufhw, optional move 5089// 2. [ssse3] 1 x pshufb 5090// 3. [ssse3] 2 x pshufb + 1 x por 5091// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 5092SDValue 5093X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 5094 SelectionDAG &DAG) const { 5095 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5096 SDValue V1 = SVOp->getOperand(0); 5097 SDValue V2 = SVOp->getOperand(1); 5098 DebugLoc dl = SVOp->getDebugLoc(); 5099 SmallVector<int, 8> MaskVals; 5100 5101 // Determine if more than 1 of the words in each of the low and high quadwords 5102 // of the result come from the same quadword of one of the two inputs. Undef 5103 // mask values count as coming from any quadword, for better codegen. 5104 SmallVector<unsigned, 4> LoQuad(4); 5105 SmallVector<unsigned, 4> HiQuad(4); 5106 BitVector InputQuads(4); 5107 for (unsigned i = 0; i < 8; ++i) { 5108 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 5109 int EltIdx = SVOp->getMaskElt(i); 5110 MaskVals.push_back(EltIdx); 5111 if (EltIdx < 0) { 5112 ++Quad[0]; 5113 ++Quad[1]; 5114 ++Quad[2]; 5115 ++Quad[3]; 5116 continue; 5117 } 5118 ++Quad[EltIdx / 4]; 5119 InputQuads.set(EltIdx / 4); 5120 } 5121 5122 int BestLoQuad = -1; 5123 unsigned MaxQuad = 1; 5124 for (unsigned i = 0; i < 4; ++i) { 5125 if (LoQuad[i] > MaxQuad) { 5126 BestLoQuad = i; 5127 MaxQuad = LoQuad[i]; 5128 } 5129 } 5130 5131 int BestHiQuad = -1; 5132 MaxQuad = 1; 5133 for (unsigned i = 0; i < 4; ++i) { 5134 if (HiQuad[i] > MaxQuad) { 5135 BestHiQuad = i; 5136 MaxQuad = HiQuad[i]; 5137 } 5138 } 5139 5140 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 5141 // of the two input vectors, shuffle them into one input vector so only a 5142 // single pshufb instruction is necessary. If There are more than 2 input 5143 // quads, disable the next transformation since it does not help SSSE3. 5144 bool V1Used = InputQuads[0] || InputQuads[1]; 5145 bool V2Used = InputQuads[2] || InputQuads[3]; 5146 if (Subtarget->hasSSSE3()) { 5147 if (InputQuads.count() == 2 && V1Used && V2Used) { 5148 BestLoQuad = InputQuads.find_first(); 5149 BestHiQuad = InputQuads.find_next(BestLoQuad); 5150 } 5151 if (InputQuads.count() > 2) { 5152 BestLoQuad = -1; 5153 BestHiQuad = -1; 5154 } 5155 } 5156 5157 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 5158 // the shuffle mask. If a quad is scored as -1, that means that it contains 5159 // words from all 4 input quadwords. 5160 SDValue NewV; 5161 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 5162 SmallVector<int, 8> MaskV; 5163 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 5164 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 5165 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 5166 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 5167 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 5168 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 5169 5170 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 5171 // source words for the shuffle, to aid later transformations. 
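    // For example (illustrative): if BestLoQuad == 2 and BestHiQuad == 1, the
    // v2i64 shuffle above yields NewV = <low 64 bits of V2, high 64 bits of V1>;
    // when every source word lives in those two quads, the loop below remaps
    // mask entries from quad 2 into 0..3 and entries from quad 1 into 4..7.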
5172 bool AllWordsInNewV = true; 5173 bool InOrder[2] = { true, true }; 5174 for (unsigned i = 0; i != 8; ++i) { 5175 int idx = MaskVals[i]; 5176 if (idx != (int)i) 5177 InOrder[i/4] = false; 5178 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 5179 continue; 5180 AllWordsInNewV = false; 5181 break; 5182 } 5183 5184 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 5185 if (AllWordsInNewV) { 5186 for (int i = 0; i != 8; ++i) { 5187 int idx = MaskVals[i]; 5188 if (idx < 0) 5189 continue; 5190 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 5191 if ((idx != i) && idx < 4) 5192 pshufhw = false; 5193 if ((idx != i) && idx > 3) 5194 pshuflw = false; 5195 } 5196 V1 = NewV; 5197 V2Used = false; 5198 BestLoQuad = 0; 5199 BestHiQuad = 1; 5200 } 5201 5202 // If we've eliminated the use of V2, and the new mask is a pshuflw or 5203 // pshufhw, that's as cheap as it gets. Return the new shuffle. 5204 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 5205 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 5206 unsigned TargetMask = 0; 5207 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 5208 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 5209 TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): 5210 X86::getShufflePSHUFLWImmediate(NewV.getNode()); 5211 V1 = NewV.getOperand(0); 5212 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 5213 } 5214 } 5215 5216 // If we have SSSE3, and all words of the result are from 1 input vector, 5217 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 5218 // is present, fall back to case 4. 5219 if (Subtarget->hasSSSE3()) { 5220 SmallVector<SDValue,16> pshufbMask; 5221 5222 // If we have elements from both input vectors, set the high bit of the 5223 // shuffle mask element to zero out elements that come from V2 in the V1 5224 // mask, and elements that come from V1 in the V2 mask, so that the two 5225 // results can be OR'd together. 5226 bool TwoInputs = V1Used && V2Used; 5227 for (unsigned i = 0; i != 8; ++i) { 5228 int EltIdx = MaskVals[i] * 2; 5229 if (TwoInputs && (EltIdx >= 16)) { 5230 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5231 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5232 continue; 5233 } 5234 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5235 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 5236 } 5237 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); 5238 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5239 DAG.getNode(ISD::BUILD_VECTOR, dl, 5240 MVT::v16i8, &pshufbMask[0], 16)); 5241 if (!TwoInputs) 5242 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5243 5244 // Calculate the shuffle mask for the second input, shuffle it, and 5245 // OR it with the first shuffled input. 
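    // In this second mask, byte positions sourced from V1 are set to 0x80 so
    // that PSHUFB writes zeros there; those zeros line up with the bytes the
    // first shuffle kept from V1, so the OR below merges the two partial
    // results.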
5246 pshufbMask.clear(); 5247 for (unsigned i = 0; i != 8; ++i) { 5248 int EltIdx = MaskVals[i] * 2; 5249 if (EltIdx < 16) { 5250 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5251 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5252 continue; 5253 } 5254 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 5255 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 5256 } 5257 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); 5258 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5259 DAG.getNode(ISD::BUILD_VECTOR, dl, 5260 MVT::v16i8, &pshufbMask[0], 16)); 5261 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5262 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5263 } 5264 5265 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 5266 // and update MaskVals with new element order. 5267 BitVector InOrder(8); 5268 if (BestLoQuad >= 0) { 5269 SmallVector<int, 8> MaskV; 5270 for (int i = 0; i != 4; ++i) { 5271 int idx = MaskVals[i]; 5272 if (idx < 0) { 5273 MaskV.push_back(-1); 5274 InOrder.set(i); 5275 } else if ((idx / 4) == BestLoQuad) { 5276 MaskV.push_back(idx & 3); 5277 InOrder.set(i); 5278 } else { 5279 MaskV.push_back(-1); 5280 } 5281 } 5282 for (unsigned i = 4; i != 8; ++i) 5283 MaskV.push_back(i); 5284 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5285 &MaskV[0]); 5286 5287 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 5288 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 5289 NewV.getOperand(0), 5290 X86::getShufflePSHUFLWImmediate(NewV.getNode()), 5291 DAG); 5292 } 5293 5294 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 5295 // and update MaskVals with the new element order. 5296 if (BestHiQuad >= 0) { 5297 SmallVector<int, 8> MaskV; 5298 for (unsigned i = 0; i != 4; ++i) 5299 MaskV.push_back(i); 5300 for (unsigned i = 4; i != 8; ++i) { 5301 int idx = MaskVals[i]; 5302 if (idx < 0) { 5303 MaskV.push_back(-1); 5304 InOrder.set(i); 5305 } else if ((idx / 4) == BestHiQuad) { 5306 MaskV.push_back((idx & 3) + 4); 5307 InOrder.set(i); 5308 } else { 5309 MaskV.push_back(-1); 5310 } 5311 } 5312 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5313 &MaskV[0]); 5314 5315 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 5316 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 5317 NewV.getOperand(0), 5318 X86::getShufflePSHUFHWImmediate(NewV.getNode()), 5319 DAG); 5320 } 5321 5322 // In case BestHi & BestLo were both -1, which means each quadword has a word 5323 // from each of the four input quadwords, calculate the InOrder bitvector now 5324 // before falling through to the insert/extract cleanup. 5325 if (BestLoQuad == -1 && BestHiQuad == -1) { 5326 NewV = V1; 5327 for (int i = 0; i != 8; ++i) 5328 if (MaskVals[i] < 0 || MaskVals[i] == i) 5329 InOrder.set(i); 5330 } 5331 5332 // The other elements are put in the right place using pextrw and pinsrw. 5333 for (unsigned i = 0; i != 8; ++i) { 5334 if (InOrder[i]) 5335 continue; 5336 int EltIdx = MaskVals[i]; 5337 if (EltIdx < 0) 5338 continue; 5339 SDValue ExtOp = (EltIdx < 8) 5340 ? 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 5341 DAG.getIntPtrConstant(EltIdx)) 5342 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 5343 DAG.getIntPtrConstant(EltIdx - 8)); 5344 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 5345 DAG.getIntPtrConstant(i)); 5346 } 5347 return NewV; 5348} 5349 5350// v16i8 shuffles - Prefer shuffles in the following order: 5351// 1. [ssse3] 1 x pshufb 5352// 2. [ssse3] 2 x pshufb + 1 x por 5353// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 5354static 5355SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 5356 SelectionDAG &DAG, 5357 const X86TargetLowering &TLI) { 5358 SDValue V1 = SVOp->getOperand(0); 5359 SDValue V2 = SVOp->getOperand(1); 5360 DebugLoc dl = SVOp->getDebugLoc(); 5361 SmallVector<int, 16> MaskVals; 5362 SVOp->getMask(MaskVals); 5363 5364 // If we have SSSE3, case 1 is generated when all result bytes come from 5365 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 5366 // present, fall back to case 3. 5367 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 5368 bool V1Only = true; 5369 bool V2Only = true; 5370 for (unsigned i = 0; i < 16; ++i) { 5371 int EltIdx = MaskVals[i]; 5372 if (EltIdx < 0) 5373 continue; 5374 if (EltIdx < 16) 5375 V2Only = false; 5376 else 5377 V1Only = false; 5378 } 5379 5380 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 5381 if (TLI.getSubtarget()->hasSSSE3()) { 5382 SmallVector<SDValue,16> pshufbMask; 5383 5384 // If all result elements are from one input vector, then only translate 5385 // undef mask values to 0x80 (zero out result) in the pshufb mask. 5386 // 5387 // Otherwise, we have elements from both input vectors, and must zero out 5388 // elements that come from V2 in the first mask, and V1 in the second mask 5389 // so that we can OR them together. 5390 bool TwoInputs = !(V1Only || V2Only); 5391 for (unsigned i = 0; i != 16; ++i) { 5392 int EltIdx = MaskVals[i]; 5393 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 5394 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5395 continue; 5396 } 5397 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5398 } 5399 // If all the elements are from V2, assign it to V1 and return after 5400 // building the first pshufb. 5401 if (V2Only) 5402 V1 = V2; 5403 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5404 DAG.getNode(ISD::BUILD_VECTOR, dl, 5405 MVT::v16i8, &pshufbMask[0], 16)); 5406 if (!TwoInputs) 5407 return V1; 5408 5409 // Calculate the shuffle mask for the second input, shuffle it, and 5410 // OR it with the first shuffled input. 5411 pshufbMask.clear(); 5412 for (unsigned i = 0; i != 16; ++i) { 5413 int EltIdx = MaskVals[i]; 5414 if (EltIdx < 16) { 5415 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5416 continue; 5417 } 5418 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 5419 } 5420 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5421 DAG.getNode(ISD::BUILD_VECTOR, dl, 5422 MVT::v16i8, &pshufbMask[0], 16)); 5423 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5424 } 5425 5426 // No SSSE3 - Calculate in place words and then fix all out of place words 5427 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 5428 // the 16 different words that comprise the two doublequadword input vectors. 5429 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5430 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); 5431 SDValue NewV = V2Only ? 
V2 : V1; 5432 for (int i = 0; i != 8; ++i) { 5433 int Elt0 = MaskVals[i*2]; 5434 int Elt1 = MaskVals[i*2+1]; 5435 5436 // This word of the result is all undef, skip it. 5437 if (Elt0 < 0 && Elt1 < 0) 5438 continue; 5439 5440 // This word of the result is already in the correct place, skip it. 5441 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 5442 continue; 5443 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 5444 continue; 5445 5446 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 5447 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 5448 SDValue InsElt; 5449 5450 // If Elt0 and Elt1 are defined, are consecutive, and can be load 5451 // using a single extract together, load it and store it. 5452 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 5453 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5454 DAG.getIntPtrConstant(Elt1 / 2)); 5455 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5456 DAG.getIntPtrConstant(i)); 5457 continue; 5458 } 5459 5460 // If Elt1 is defined, extract it from the appropriate source. If the 5461 // source byte is not also odd, shift the extracted word left 8 bits 5462 // otherwise clear the bottom 8 bits if we need to do an or. 5463 if (Elt1 >= 0) { 5464 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5465 DAG.getIntPtrConstant(Elt1 / 2)); 5466 if ((Elt1 & 1) == 0) 5467 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 5468 DAG.getConstant(8, 5469 TLI.getShiftAmountTy(InsElt.getValueType()))); 5470 else if (Elt0 >= 0) 5471 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 5472 DAG.getConstant(0xFF00, MVT::i16)); 5473 } 5474 // If Elt0 is defined, extract it from the appropriate source. If the 5475 // source byte is not also even, shift the extracted word right 8 bits. If 5476 // Elt1 was also defined, OR the extracted values together before 5477 // inserting them in the result. 5478 if (Elt0 >= 0) { 5479 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 5480 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 5481 if ((Elt0 & 1) != 0) 5482 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 5483 DAG.getConstant(8, 5484 TLI.getShiftAmountTy(InsElt0.getValueType()))); 5485 else if (Elt1 >= 0) 5486 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 5487 DAG.getConstant(0x00FF, MVT::i16)); 5488 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 5489 : InsElt0; 5490 } 5491 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5492 DAG.getIntPtrConstant(i)); 5493 } 5494 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); 5495} 5496 5497/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 5498/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 5499/// done when every pair / quad of shuffle mask elements point to elements in 5500/// the right sequence. e.g. 5501/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> 5502static 5503SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 5504 SelectionDAG &DAG, DebugLoc dl) { 5505 EVT VT = SVOp->getValueType(0); 5506 SDValue V1 = SVOp->getOperand(0); 5507 SDValue V2 = SVOp->getOperand(1); 5508 unsigned NumElems = VT.getVectorNumElements(); 5509 unsigned NewWidth = (NumElems == 4) ? 
2 : 4; 5510 EVT NewVT; 5511 switch (VT.getSimpleVT().SimpleTy) { 5512 default: assert(false && "Unexpected!"); 5513 case MVT::v4f32: NewVT = MVT::v2f64; break; 5514 case MVT::v4i32: NewVT = MVT::v2i64; break; 5515 case MVT::v8i16: NewVT = MVT::v4i32; break; 5516 case MVT::v16i8: NewVT = MVT::v4i32; break; 5517 } 5518 5519 int Scale = NumElems / NewWidth; 5520 SmallVector<int, 8> MaskVec; 5521 for (unsigned i = 0; i < NumElems; i += Scale) { 5522 int StartIdx = -1; 5523 for (int j = 0; j < Scale; ++j) { 5524 int EltIdx = SVOp->getMaskElt(i+j); 5525 if (EltIdx < 0) 5526 continue; 5527 if (StartIdx == -1) 5528 StartIdx = EltIdx - (EltIdx % Scale); 5529 if (EltIdx != StartIdx + j) 5530 return SDValue(); 5531 } 5532 if (StartIdx == -1) 5533 MaskVec.push_back(-1); 5534 else 5535 MaskVec.push_back(StartIdx / Scale); 5536 } 5537 5538 V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); 5539 V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); 5540 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 5541} 5542 5543/// getVZextMovL - Return a zero-extending vector move low node. 5544/// 5545static SDValue getVZextMovL(EVT VT, EVT OpVT, 5546 SDValue SrcOp, SelectionDAG &DAG, 5547 const X86Subtarget *Subtarget, DebugLoc dl) { 5548 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 5549 LoadSDNode *LD = NULL; 5550 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 5551 LD = dyn_cast<LoadSDNode>(SrcOp); 5552 if (!LD) { 5553 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 5554 // instead. 5555 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 5556 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && 5557 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 5558 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && 5559 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 5560 // PR2108 5561 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 5562 return DAG.getNode(ISD::BITCAST, dl, VT, 5563 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 5564 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5565 OpVT, 5566 SrcOp.getOperand(0) 5567 .getOperand(0)))); 5568 } 5569 } 5570 } 5571 5572 return DAG.getNode(ISD::BITCAST, dl, VT, 5573 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 5574 DAG.getNode(ISD::BITCAST, dl, 5575 OpVT, SrcOp))); 5576} 5577 5578/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles 5579/// which could not be matched by any known target speficic shuffle 5580static SDValue 5581LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 5582 return SDValue(); 5583} 5584 5585/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with 5586/// 4 elements, and match them with several different shuffle types. 
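/// For example (illustrative): a v4f32 mask such as <0, 4, 1, 5> pulls two
/// elements from each input and is handled by the two-shuffle path below,
/// while <0, 1, 2, 4> (three elements from V1) takes the shufps-based
/// three/one split.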
5587static SDValue 5588LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 5589 SDValue V1 = SVOp->getOperand(0); 5590 SDValue V2 = SVOp->getOperand(1); 5591 DebugLoc dl = SVOp->getDebugLoc(); 5592 EVT VT = SVOp->getValueType(0); 5593 5594 assert(VT.getSizeInBits() == 128 && "Unsupported vector size"); 5595 5596 SmallVector<std::pair<int, int>, 8> Locs; 5597 Locs.resize(4); 5598 SmallVector<int, 8> Mask1(4U, -1); 5599 SmallVector<int, 8> PermMask; 5600 SVOp->getMask(PermMask); 5601 5602 unsigned NumHi = 0; 5603 unsigned NumLo = 0; 5604 for (unsigned i = 0; i != 4; ++i) { 5605 int Idx = PermMask[i]; 5606 if (Idx < 0) { 5607 Locs[i] = std::make_pair(-1, -1); 5608 } else { 5609 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 5610 if (Idx < 4) { 5611 Locs[i] = std::make_pair(0, NumLo); 5612 Mask1[NumLo] = Idx; 5613 NumLo++; 5614 } else { 5615 Locs[i] = std::make_pair(1, NumHi); 5616 if (2+NumHi < 4) 5617 Mask1[2+NumHi] = Idx; 5618 NumHi++; 5619 } 5620 } 5621 } 5622 5623 if (NumLo <= 2 && NumHi <= 2) { 5624 // If no more than two elements come from either vector. This can be 5625 // implemented with two shuffles. First shuffle gather the elements. 5626 // The second shuffle, which takes the first shuffle as both of its 5627 // vector operands, put the elements into the right order. 5628 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5629 5630 SmallVector<int, 8> Mask2(4U, -1); 5631 5632 for (unsigned i = 0; i != 4; ++i) { 5633 if (Locs[i].first == -1) 5634 continue; 5635 else { 5636 unsigned Idx = (i < 2) ? 0 : 4; 5637 Idx += Locs[i].first * 2 + Locs[i].second; 5638 Mask2[i] = Idx; 5639 } 5640 } 5641 5642 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 5643 } else if (NumLo == 3 || NumHi == 3) { 5644 // Otherwise, we must have three elements from one vector, call it X, and 5645 // one element from the other, call it Y. First, use a shufps to build an 5646 // intermediate vector with the one element from Y and the element from X 5647 // that will be in the same half in the final destination (the indexes don't 5648 // matter). Then, use a shufps to build the final vector, taking the half 5649 // containing the element from Y from the intermediate, and the other half 5650 // from X. 5651 if (NumHi == 3) { 5652 // Normalize it so the 3 elements come from V1. 5653 CommuteVectorShuffleMask(PermMask, VT); 5654 std::swap(V1, V2); 5655 } 5656 5657 // Find the element from V2. 5658 unsigned HiIndex; 5659 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 5660 int Val = PermMask[HiIndex]; 5661 if (Val < 0) 5662 continue; 5663 if (Val >= 4) 5664 break; 5665 } 5666 5667 Mask1[0] = PermMask[HiIndex]; 5668 Mask1[1] = -1; 5669 Mask1[2] = PermMask[HiIndex^1]; 5670 Mask1[3] = -1; 5671 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5672 5673 if (HiIndex >= 2) { 5674 Mask1[0] = PermMask[0]; 5675 Mask1[1] = PermMask[1]; 5676 Mask1[2] = HiIndex & 1 ? 6 : 4; 5677 Mask1[3] = HiIndex & 1 ? 4 : 6; 5678 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5679 } else { 5680 Mask1[0] = HiIndex & 1 ? 2 : 0; 5681 Mask1[1] = HiIndex & 1 ? 0 : 2; 5682 Mask1[2] = PermMask[2]; 5683 Mask1[3] = PermMask[3]; 5684 if (Mask1[2] >= 0) 5685 Mask1[2] += 4; 5686 if (Mask1[3] >= 0) 5687 Mask1[3] += 4; 5688 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 5689 } 5690 } 5691 5692 // Break it into (shuffle shuffle_hi, shuffle_lo). 
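  // Illustrative sketch: LoMask gathers the elements destined for the two low
  // result slots and HiMask those for the two high slots; each is applied as a
  // shuffle of V1/V2, and a final shuffle of the two intermediate results puts
  // every element into its final position.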
5693 Locs.clear(); 5694 Locs.resize(4); 5695 SmallVector<int,8> LoMask(4U, -1); 5696 SmallVector<int,8> HiMask(4U, -1); 5697 5698 SmallVector<int,8> *MaskPtr = &LoMask; 5699 unsigned MaskIdx = 0; 5700 unsigned LoIdx = 0; 5701 unsigned HiIdx = 2; 5702 for (unsigned i = 0; i != 4; ++i) { 5703 if (i == 2) { 5704 MaskPtr = &HiMask; 5705 MaskIdx = 1; 5706 LoIdx = 0; 5707 HiIdx = 2; 5708 } 5709 int Idx = PermMask[i]; 5710 if (Idx < 0) { 5711 Locs[i] = std::make_pair(-1, -1); 5712 } else if (Idx < 4) { 5713 Locs[i] = std::make_pair(MaskIdx, LoIdx); 5714 (*MaskPtr)[LoIdx] = Idx; 5715 LoIdx++; 5716 } else { 5717 Locs[i] = std::make_pair(MaskIdx, HiIdx); 5718 (*MaskPtr)[HiIdx] = Idx; 5719 HiIdx++; 5720 } 5721 } 5722 5723 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 5724 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 5725 SmallVector<int, 8> MaskOps; 5726 for (unsigned i = 0; i != 4; ++i) { 5727 if (Locs[i].first == -1) { 5728 MaskOps.push_back(-1); 5729 } else { 5730 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 5731 MaskOps.push_back(Idx); 5732 } 5733 } 5734 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 5735} 5736 5737static bool MayFoldVectorLoad(SDValue V) { 5738 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 5739 V = V.getOperand(0); 5740 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5741 V = V.getOperand(0); 5742 if (MayFoldLoad(V)) 5743 return true; 5744 return false; 5745} 5746 5747// FIXME: the version above should always be used. Since there's 5748// a bug where several vector shuffles can't be folded because the 5749// DAG is not updated during lowering and a node claims to have two 5750// uses while it only has one, use this version, and let isel match 5751// another instruction if the load really happens to have more than 5752// one use. Remove this version after this bug get fixed. 5753// rdar://8434668, PR8156 5754static bool RelaxedMayFoldVectorLoad(SDValue V) { 5755 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 5756 V = V.getOperand(0); 5757 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5758 V = V.getOperand(0); 5759 if (ISD::isNormalLoad(V.getNode())) 5760 return true; 5761 return false; 5762} 5763 5764/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by 5765/// a vector extract, and if both can be later optimized into a single load. 5766/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked 5767/// here because otherwise a target specific shuffle node is going to be 5768/// emitted for this shuffle, and the optimization not done. 5769/// FIXME: This is probably not the best approach, but fix the problem 5770/// until the right path is decided. 5771static 5772bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, 5773 const TargetLowering &TLI) { 5774 EVT VT = V.getValueType(); 5775 ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); 5776 5777 // Be sure that the vector shuffle is present in a pattern like this: 5778 // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) 5779 if (!V.hasOneUse()) 5780 return false; 5781 5782 SDNode *N = *V.getNode()->use_begin(); 5783 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 5784 return false; 5785 5786 SDValue EltNo = N->getOperand(1); 5787 if (!isa<ConstantSDNode>(EltNo)) 5788 return false; 5789 5790 // If the bit convert changed the number of elements, it is unsafe 5791 // to examine the mask. 
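  // e.g. extracting from a v4f32 bitcast of a v2i64 shuffle: the two v2i64
  // mask entries do not map 1:1 onto the four f32 elements, so bail out.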
5792 bool HasShuffleIntoBitcast = false; 5793 if (V.getOpcode() == ISD::BITCAST) { 5794 EVT SrcVT = V.getOperand(0).getValueType(); 5795 if (SrcVT.getVectorNumElements() != VT.getVectorNumElements()) 5796 return false; 5797 V = V.getOperand(0); 5798 HasShuffleIntoBitcast = true; 5799 } 5800 5801 // Select the input vector, guarding against out of range extract vector. 5802 unsigned NumElems = VT.getVectorNumElements(); 5803 unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 5804 int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt); 5805 V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); 5806 5807 // Skip one more bit_convert if necessary 5808 if (V.getOpcode() == ISD::BITCAST) 5809 V = V.getOperand(0); 5810 5811 if (ISD::isNormalLoad(V.getNode())) { 5812 // Is the original load suitable? 5813 LoadSDNode *LN0 = cast<LoadSDNode>(V); 5814 5815 // FIXME: avoid the multi-use bug that is preventing lots of 5816 // of foldings to be detected, this is still wrong of course, but 5817 // give the temporary desired behavior, and if it happens that 5818 // the load has real more uses, during isel it will not fold, and 5819 // will generate poor code. 5820 if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse() 5821 return false; 5822 5823 if (!HasShuffleIntoBitcast) 5824 return true; 5825 5826 // If there's a bitcast before the shuffle, check if the load type and 5827 // alignment is valid. 5828 unsigned Align = LN0->getAlignment(); 5829 unsigned NewAlign = 5830 TLI.getTargetData()->getABITypeAlignment( 5831 VT.getTypeForEVT(*DAG.getContext())); 5832 5833 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) 5834 return false; 5835 } 5836 5837 return true; 5838} 5839 5840static 5841SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { 5842 EVT VT = Op.getValueType(); 5843 5844 // Canonizalize to v2f64. 5845 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); 5846 return DAG.getNode(ISD::BITCAST, dl, VT, 5847 getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, 5848 V1, DAG)); 5849} 5850 5851static 5852SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, 5853 bool HasSSE2) { 5854 SDValue V1 = Op.getOperand(0); 5855 SDValue V2 = Op.getOperand(1); 5856 EVT VT = Op.getValueType(); 5857 5858 assert(VT != MVT::v2i64 && "unsupported shuffle type"); 5859 5860 if (HasSSE2 && VT == MVT::v2f64) 5861 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); 5862 5863 // v4f32 or v4i32 5864 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG); 5865} 5866 5867static 5868SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { 5869 SDValue V1 = Op.getOperand(0); 5870 SDValue V2 = Op.getOperand(1); 5871 EVT VT = Op.getValueType(); 5872 5873 assert((VT == MVT::v4i32 || VT == MVT::v4f32) && 5874 "unsupported shuffle type"); 5875 5876 if (V2.getOpcode() == ISD::UNDEF) 5877 V2 = V1; 5878 5879 // v4i32 or v4f32 5880 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); 5881} 5882 5883static 5884SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { 5885 SDValue V1 = Op.getOperand(0); 5886 SDValue V2 = Op.getOperand(1); 5887 EVT VT = Op.getValueType(); 5888 unsigned NumElems = VT.getVectorNumElements(); 5889 5890 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second 5891 // operand of these instructions is only memory, so check if there's a 5892 // potencial load folding here, otherwise use SHUFPS or MOVSD to match the 5893 // same masks. 
5894 bool CanFoldLoad = false; 5895 5896 // Trivial case, when V2 comes from a load. 5897 if (MayFoldVectorLoad(V2)) 5898 CanFoldLoad = true; 5899 5900 // When V1 is a load, it can be folded later into a store in isel, example: 5901 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) 5902 // turns into: 5903 // (MOVLPSmr addr:$src1, VR128:$src2) 5904 // So, recognize this potential and also use MOVLPS or MOVLPD 5905 if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) 5906 CanFoldLoad = true; 5907 5908 // Both of them can't be memory operations though. 5909 if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2)) 5910 CanFoldLoad = false; 5911 5912 if (CanFoldLoad) { 5913 if (HasSSE2 && NumElems == 2) 5914 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); 5915 5916 if (NumElems == 4) 5917 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); 5918 } 5919 5920 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5921 // movl and movlp will both match v2i64, but v2i64 is never matched by 5922 // movl earlier because we make it strict to avoid messing with the movlp load 5923 // folding logic (see the code above getMOVLP call). Match it here then, 5924 // this is horrible, but will stay like this until we move all shuffle 5925 // matching to x86 specific nodes. Note that for the 1st condition all 5926 // types are matched with movsd. 5927 if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp)) 5928 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 5929 else if (HasSSE2) 5930 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 5931 5932 5933 assert(VT != MVT::v4i32 && "unsupported shuffle type"); 5934 5935 // Invert the operand order and use SHUFPS to match it. 5936 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1, 5937 X86::getShuffleSHUFImmediate(SVOp), DAG); 5938} 5939 5940static inline unsigned getUNPCKLOpcode(EVT VT) { 5941 switch(VT.getSimpleVT().SimpleTy) { 5942 case MVT::v4i32: return X86ISD::PUNPCKLDQ; 5943 case MVT::v2i64: return X86ISD::PUNPCKLQDQ; 5944 case MVT::v4f32: return X86ISD::UNPCKLPS; 5945 case MVT::v2f64: return X86ISD::UNPCKLPD; 5946 case MVT::v8f32: return X86ISD::VUNPCKLPSY; 5947 case MVT::v4f64: return X86ISD::VUNPCKLPDY; 5948 case MVT::v16i8: return X86ISD::PUNPCKLBW; 5949 case MVT::v8i16: return X86ISD::PUNPCKLWD; 5950 default: 5951 llvm_unreachable("Unknown type for unpckl"); 5952 } 5953 return 0; 5954} 5955 5956static inline unsigned getUNPCKHOpcode(EVT VT) { 5957 switch(VT.getSimpleVT().SimpleTy) { 5958 case MVT::v4i32: return X86ISD::PUNPCKHDQ; 5959 case MVT::v2i64: return X86ISD::PUNPCKHQDQ; 5960 case MVT::v4f32: return X86ISD::UNPCKHPS; 5961 case MVT::v2f64: return X86ISD::UNPCKHPD; 5962 case MVT::v8f32: return X86ISD::VUNPCKHPSY; 5963 case MVT::v4f64: return X86ISD::VUNPCKHPDY; 5964 case MVT::v16i8: return X86ISD::PUNPCKHBW; 5965 case MVT::v8i16: return X86ISD::PUNPCKHWD; 5966 default: 5967 llvm_unreachable("Unknown type for unpckh"); 5968 } 5969 return 0; 5970} 5971 5972static inline unsigned getVPERMILOpcode(EVT VT) { 5973 switch(VT.getSimpleVT().SimpleTy) { 5974 case MVT::v4i32: 5975 case MVT::v4f32: return X86ISD::VPERMILPS; 5976 case MVT::v2i64: 5977 case MVT::v2f64: return X86ISD::VPERMILPD; 5978 case MVT::v8i32: 5979 case MVT::v8f32: return X86ISD::VPERMILPSY; 5980 case MVT::v4i64: 5981 case MVT::v4f64: return X86ISD::VPERMILPDY; 5982 default: 5983 llvm_unreachable("Unknown type for vpermil"); 5984 } 5985 return 0; 5986} 5987 5988static 5989SDValue 
NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, 5990 const TargetLowering &TLI, 5991 const X86Subtarget *Subtarget) { 5992 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5993 EVT VT = Op.getValueType(); 5994 DebugLoc dl = Op.getDebugLoc(); 5995 SDValue V1 = Op.getOperand(0); 5996 SDValue V2 = Op.getOperand(1); 5997 5998 if (isZeroShuffle(SVOp)) 5999 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 6000 6001 // Handle splat operations 6002 if (SVOp->isSplat()) { 6003 unsigned NumElem = VT.getVectorNumElements(); 6004 // Special case: this is the only place now where it's allowed to return 6005 // a vector_shuffle operation without using a target specific node, because 6006 // *hopefully* it will be optimized away by the dag combiner. FIXME: should 6007 // this be moved to DAGCombine instead? 6008 if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI)) 6009 return Op; 6010 6011 // Since there's no native support for scalar_to_vector for 256-bit AVX, a 6012 // 128-bit scalar_to_vector + INSERT_SUBVECTOR is generated. Recognize this 6013 // idiom and do the shuffle before the insertion; this yields fewer 6014 // instructions in the end. 6015 if (VT.is256BitVector() && 6016 V1.getOpcode() == ISD::INSERT_SUBVECTOR && 6017 V1.getOperand(0).getOpcode() == ISD::UNDEF && 6018 V1.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR) 6019 return PromoteVectorToScalarSplat(SVOp, DAG); 6020 6021 // Handle splats by matching through known shuffle masks 6022 if ((VT.is128BitVector() && NumElem <= 4) || 6023 (VT.is256BitVector() && NumElem <= 8)) 6024 return SDValue(); 6025 6026 // All i16 and i8 vector types can't be used directly by a generic shuffle 6027 // instruction because the target has no such instruction. Generate shuffles 6028 // which repeat i16 and i8 several times until they fit in i32, and then can 6029 // be manipulated by target supported shuffles. After the insertion of the 6030 // necessary shuffles, the result is bitcasted back to v4f32 or v8f32. 6031 return PromoteSplat(SVOp, DAG); 6032 } 6033 6034 // If the shuffle can be profitably rewritten as a narrower shuffle, then 6035 // do it! 6036 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 6037 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6038 if (NewOp.getNode()) 6039 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); 6040 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 6041 // FIXME: Figure out a cleaner way to do this. 6042 // Try to make use of movq to zero out the top part.
6043 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 6044 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6045 if (NewOp.getNode()) { 6046 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 6047 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 6048 DAG, Subtarget, dl); 6049 } 6050 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 6051 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6052 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 6053 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 6054 DAG, Subtarget, dl); 6055 } 6056 } 6057 return SDValue(); 6058} 6059 6060SDValue 6061X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 6062 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6063 SDValue V1 = Op.getOperand(0); 6064 SDValue V2 = Op.getOperand(1); 6065 EVT VT = Op.getValueType(); 6066 DebugLoc dl = Op.getDebugLoc(); 6067 unsigned NumElems = VT.getVectorNumElements(); 6068 bool isMMX = VT.getSizeInBits() == 64; 6069 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 6070 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 6071 bool V1IsSplat = false; 6072 bool V2IsSplat = false; 6073 bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX(); 6074 bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX(); 6075 bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX(); 6076 MachineFunction &MF = DAG.getMachineFunction(); 6077 bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); 6078 6079 // Shuffle operations on MMX not supported. 6080 if (isMMX) 6081 return Op; 6082 6083 // Vector shuffle lowering takes 3 steps: 6084 // 6085 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable 6086 // narrowing and commutation of operands should be handled. 6087 // 2) Matching of shuffles with known shuffle masks to x86 target specific 6088 // shuffle nodes. 6089 // 3) Rewriting of unmatched masks into new generic shuffle operations, 6090 // so the shuffle can be broken into other shuffles and the legalizer can 6091 // try the lowering again. 6092 // 6093 // The general ideia is that no vector_shuffle operation should be left to 6094 // be matched during isel, all of them must be converted to a target specific 6095 // node here. 6096 6097 // Normalize the input vectors. Here splats, zeroed vectors, profitable 6098 // narrowing and commutation of operands should be handled. The actual code 6099 // doesn't include all of those, work in progress... 6100 SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget); 6101 if (NewOp.getNode()) 6102 return NewOp; 6103 6104 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and 6105 // unpckh_undef). Only use pshufd if speed is more important than size. 
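  // For example, for v4i32 the unpckl_undef mask is <0,0,1,1>: both
  // "pshufd $0x50, %xmm0, %xmm0" and "punpckldq %xmm0, %xmm0" produce it,
  // but the unpck form is one byte smaller because it takes no immediate.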
6106 if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) 6107 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 6108 if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) 6109 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 6110 6111 if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef && 6112 RelaxedMayFoldVectorLoad(V1)) 6113 return getMOVDDup(Op, dl, V1, DAG); 6114 6115 if (X86::isMOVHLPS_v_undef_Mask(SVOp)) 6116 return getMOVHighToLow(Op, dl, DAG); 6117 6118 // Use to match splats 6119 if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef && 6120 (VT == MVT::v2f64 || VT == MVT::v2i64)) 6121 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 6122 6123 if (X86::isPSHUFDMask(SVOp)) { 6124 // The actual implementation will match the mask in the if above and then 6125 // during isel it can match several different instructions, not only pshufd 6126 // as its name says, sad but true, emulate the behavior for now... 6127 if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 6128 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 6129 6130 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 6131 6132 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 6133 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 6134 6135 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 6136 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1, 6137 TargetMask, DAG); 6138 6139 if (VT == MVT::v4f32) 6140 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1, 6141 TargetMask, DAG); 6142 } 6143 6144 // Check if this can be converted into a logical shift. 6145 bool isLeft = false; 6146 unsigned ShAmt = 0; 6147 SDValue ShVal; 6148 bool isShift = getSubtarget()->hasSSE2() && 6149 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 6150 if (isShift && ShVal.hasOneUse()) { 6151 // If the shifted value has multiple uses, it may be cheaper to use 6152 // v_set0 + movlhps or movhlps, etc. 6153 EVT EltVT = VT.getVectorElementType(); 6154 ShAmt *= EltVT.getSizeInBits(); 6155 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 6156 } 6157 6158 if (X86::isMOVLMask(SVOp)) { 6159 if (V1IsUndef) 6160 return V2; 6161 if (ISD::isBuildVectorAllZeros(V1.getNode())) 6162 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 6163 if (!X86::isMOVLPMask(SVOp)) { 6164 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 6165 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 6166 6167 if (VT == MVT::v4i32 || VT == MVT::v4f32) 6168 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 6169 } 6170 } 6171 6172 // FIXME: fold these into legal mask. 6173 if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) 6174 return getMOVLowToHigh(Op, dl, DAG, HasSSE2); 6175 6176 if (X86::isMOVHLPSMask(SVOp)) 6177 return getMOVHighToLow(Op, dl, DAG); 6178 6179 if (X86::isMOVSHDUPMask(SVOp, Subtarget)) 6180 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 6181 6182 if (X86::isMOVSLDUPMask(SVOp, Subtarget)) 6183 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 6184 6185 if (X86::isMOVLPMask(SVOp)) 6186 return getMOVLP(Op, dl, DAG, HasSSE2); 6187 6188 if (ShouldXformToMOVHLPS(SVOp) || 6189 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 6190 return CommuteVectorShuffle(SVOp, DAG); 6191 6192 if (isShift) { 6193 // No better options. Use a vshl / vsrl. 
6194 EVT EltVT = VT.getVectorElementType(); 6195 ShAmt *= EltVT.getSizeInBits(); 6196 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 6197 } 6198 6199 bool Commuted = false; 6200 // FIXME: This should also accept a bitcast of a splat? Be careful, not 6201 // 1,1,1,1 -> v8i16 though. 6202 V1IsSplat = isSplatVector(V1.getNode()); 6203 V2IsSplat = isSplatVector(V2.getNode()); 6204 6205 // Canonicalize the splat or undef, if present, to be on the RHS. 6206 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 6207 Op = CommuteVectorShuffle(SVOp, DAG); 6208 SVOp = cast<ShuffleVectorSDNode>(Op); 6209 V1 = SVOp->getOperand(0); 6210 V2 = SVOp->getOperand(1); 6211 std::swap(V1IsSplat, V2IsSplat); 6212 std::swap(V1IsUndef, V2IsUndef); 6213 Commuted = true; 6214 } 6215 6216 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 6217 // Shuffling low element of v1 into undef, just return v1. 6218 if (V2IsUndef) 6219 return V1; 6220 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 6221 // the instruction selector will not match, so get a canonical MOVL with 6222 // swapped operands to undo the commute. 6223 return getMOVL(DAG, dl, VT, V2, V1); 6224 } 6225 6226 if (X86::isUNPCKLMask(SVOp)) 6227 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); 6228 6229 if (X86::isUNPCKHMask(SVOp)) 6230 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); 6231 6232 if (V2IsSplat) { 6233 // Normalize mask so all entries that point to V2 points to its first 6234 // element then try to match unpck{h|l} again. If match, return a 6235 // new vector_shuffle with the corrected mask. 6236 SDValue NewMask = NormalizeMask(SVOp, DAG); 6237 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 6238 if (NSVOp != SVOp) { 6239 if (X86::isUNPCKLMask(NSVOp, true)) { 6240 return NewMask; 6241 } else if (X86::isUNPCKHMask(NSVOp, true)) { 6242 return NewMask; 6243 } 6244 } 6245 } 6246 6247 if (Commuted) { 6248 // Commute is back and try unpck* again. 6249 // FIXME: this seems wrong. 6250 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 6251 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 6252 6253 if (X86::isUNPCKLMask(NewSVOp)) 6254 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); 6255 6256 if (X86::isUNPCKHMask(NewSVOp)) 6257 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); 6258 } 6259 6260 // Normalize the node to match x86 shuffle ops if needed 6261 if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 6262 return CommuteVectorShuffle(SVOp, DAG); 6263 6264 // The checks below are all present in isShuffleMaskLegal, but they are 6265 // inlined here right now to enable us to directly emit target specific 6266 // nodes, and remove one by one until they don't return Op anymore. 
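  // For example, the splat check below turns a v2f64/v2i64 splat of element 0
  // (with V2 undef) into an unpcklpd/punpcklqdq of the vector with itself,
  // which duplicates the low element into both lanes.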
6267 SmallVector<int, 16> M; 6268 SVOp->getMask(M); 6269 6270 if (isPALIGNRMask(M, VT, HasSSSE3)) 6271 return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, 6272 X86::getShufflePALIGNRImmediate(SVOp), 6273 DAG); 6274 6275 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 6276 SVOp->getSplatIndex() == 0 && V2IsUndef) { 6277 if (VT == MVT::v2f64) 6278 return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); 6279 if (VT == MVT::v2i64) 6280 return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); 6281 } 6282 6283 if (isPSHUFHWMask(M, VT)) 6284 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 6285 X86::getShufflePSHUFHWImmediate(SVOp), 6286 DAG); 6287 6288 if (isPSHUFLWMask(M, VT)) 6289 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 6290 X86::getShufflePSHUFLWImmediate(SVOp), 6291 DAG); 6292 6293 if (isSHUFPMask(M, VT)) { 6294 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 6295 if (VT == MVT::v4f32 || VT == MVT::v4i32) 6296 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2, 6297 TargetMask, DAG); 6298 if (VT == MVT::v2f64 || VT == MVT::v2i64) 6299 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2, 6300 TargetMask, DAG); 6301 } 6302 6303 if (X86::isUNPCKL_v_undef_Mask(SVOp)) 6304 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 6305 if (X86::isUNPCKH_v_undef_Mask(SVOp)) 6306 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 6307 6308 //===--------------------------------------------------------------------===// 6309 // Generate target specific nodes for 128 or 256-bit shuffles only 6310 // supported in the AVX instruction set. 6311 // 6312 6313 // Handle VPERMILPS* permutations 6314 if (isVPERMILPSMask(M, VT, Subtarget)) 6315 return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, 6316 getShuffleVPERMILPSImmediate(SVOp), DAG); 6317 6318 // Handle VPERMILPD* permutations 6319 if (isVPERMILPDMask(M, VT, Subtarget)) 6320 return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, 6321 getShuffleVPERMILPDImmediate(SVOp), DAG); 6322 6323 //===--------------------------------------------------------------------===// 6324 // Since no target specific shuffle was selected for this generic one, 6325 // lower it into other known shuffles. FIXME: this isn't true yet, but 6326 // this is the plan. 6327 // 6328 6329 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 6330 if (VT == MVT::v8i16) { 6331 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 6332 if (NewOp.getNode()) 6333 return NewOp; 6334 } 6335 6336 if (VT == MVT::v16i8) { 6337 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 6338 if (NewOp.getNode()) 6339 return NewOp; 6340 } 6341 6342 // Handle all 128-bit wide vectors with 4 elements, and match them with 6343 // several different shuffle types. 
6344 if (NumElems == 4 && VT.getSizeInBits() == 128) 6345 return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); 6346 6347 // Handle general 256-bit shuffles 6348 if (VT.is256BitVector()) 6349 return LowerVECTOR_SHUFFLE_256(SVOp, DAG); 6350 6351 return SDValue(); 6352} 6353 6354SDValue 6355X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 6356 SelectionDAG &DAG) const { 6357 EVT VT = Op.getValueType(); 6358 DebugLoc dl = Op.getDebugLoc(); 6359 6360 if (Op.getOperand(0).getValueType().getSizeInBits() != 128) 6361 return SDValue(); 6362 6363 if (VT.getSizeInBits() == 8) { 6364 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 6365 Op.getOperand(0), Op.getOperand(1)); 6366 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6367 DAG.getValueType(VT)); 6368 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6369 } else if (VT.getSizeInBits() == 16) { 6370 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6371 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 6372 if (Idx == 0) 6373 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6374 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6375 DAG.getNode(ISD::BITCAST, dl, 6376 MVT::v4i32, 6377 Op.getOperand(0)), 6378 Op.getOperand(1))); 6379 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 6380 Op.getOperand(0), Op.getOperand(1)); 6381 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6382 DAG.getValueType(VT)); 6383 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6384 } else if (VT == MVT::f32) { 6385 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 6386 // the result back to FR32 register. It's only worth matching if the 6387 // result has a single use which is a store or a bitcast to i32. And in 6388 // the case of a store, it's not worth it if the index is a constant 0, 6389 // because a MOVSSmr can be used instead, which is smaller and faster. 6390 if (!Op.hasOneUse()) 6391 return SDValue(); 6392 SDNode *User = *Op.getNode()->use_begin(); 6393 if ((User->getOpcode() != ISD::STORE || 6394 (isa<ConstantSDNode>(Op.getOperand(1)) && 6395 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 6396 (User->getOpcode() != ISD::BITCAST || 6397 User->getValueType(0) != MVT::i32)) 6398 return SDValue(); 6399 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6400 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 6401 Op.getOperand(0)), 6402 Op.getOperand(1)); 6403 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 6404 } else if (VT == MVT::i32) { 6405 // ExtractPS works with constant index. 6406 if (isa<ConstantSDNode>(Op.getOperand(1))) 6407 return Op; 6408 } 6409 return SDValue(); 6410} 6411 6412 6413SDValue 6414X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 6415 SelectionDAG &DAG) const { 6416 if (!isa<ConstantSDNode>(Op.getOperand(1))) 6417 return SDValue(); 6418 6419 SDValue Vec = Op.getOperand(0); 6420 EVT VecVT = Vec.getValueType(); 6421 6422 // If this is a 256-bit vector result, first extract the 128-bit vector and 6423 // then extract the element from the 128-bit vector. 6424 if (VecVT.getSizeInBits() == 256) { 6425 DebugLoc dl = Op.getNode()->getDebugLoc(); 6426 unsigned NumElems = VecVT.getVectorNumElements(); 6427 SDValue Idx = Op.getOperand(1); 6428 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 6429 6430 // Get the 128-bit vector. 6431 bool Upper = IdxVal >= NumElems/2; 6432 Vec = Extract128BitVector(Vec, 6433 DAG.getConstant(Upper ? 
NumElems/2 : 0, MVT::i32), DAG, dl); 6434 6435 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, 6436 Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx); 6437 } 6438 6439 assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); 6440 6441 if (Subtarget->hasSSE41() || Subtarget->hasAVX()) { 6442 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 6443 if (Res.getNode()) 6444 return Res; 6445 } 6446 6447 EVT VT = Op.getValueType(); 6448 DebugLoc dl = Op.getDebugLoc(); 6449 // TODO: handle v16i8. 6450 if (VT.getSizeInBits() == 16) { 6451 SDValue Vec = Op.getOperand(0); 6452 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6453 if (Idx == 0) 6454 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6455 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6456 DAG.getNode(ISD::BITCAST, dl, 6457 MVT::v4i32, Vec), 6458 Op.getOperand(1))); 6459 // Transform it so it match pextrw which produces a 32-bit result. 6460 EVT EltVT = MVT::i32; 6461 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 6462 Op.getOperand(0), Op.getOperand(1)); 6463 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 6464 DAG.getValueType(VT)); 6465 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6466 } else if (VT.getSizeInBits() == 32) { 6467 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6468 if (Idx == 0) 6469 return Op; 6470 6471 // SHUFPS the element to the lowest double word, then movss. 6472 int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; 6473 EVT VVT = Op.getOperand(0).getValueType(); 6474 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6475 DAG.getUNDEF(VVT), Mask); 6476 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6477 DAG.getIntPtrConstant(0)); 6478 } else if (VT.getSizeInBits() == 64) { 6479 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 6480 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 6481 // to match extract_elt for f64. 6482 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6483 if (Idx == 0) 6484 return Op; 6485 6486 // UNPCKHPD the element to the lowest double word, then movsd. 6487 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 6488 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 6489 int Mask[2] = { 1, -1 }; 6490 EVT VVT = Op.getOperand(0).getValueType(); 6491 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6492 DAG.getUNDEF(VVT), Mask); 6493 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6494 DAG.getIntPtrConstant(0)); 6495 } 6496 6497 return SDValue(); 6498} 6499 6500SDValue 6501X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 6502 SelectionDAG &DAG) const { 6503 EVT VT = Op.getValueType(); 6504 EVT EltVT = VT.getVectorElementType(); 6505 DebugLoc dl = Op.getDebugLoc(); 6506 6507 SDValue N0 = Op.getOperand(0); 6508 SDValue N1 = Op.getOperand(1); 6509 SDValue N2 = Op.getOperand(2); 6510 6511 if (VT.getSizeInBits() == 256) 6512 return SDValue(); 6513 6514 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 6515 isa<ConstantSDNode>(N2)) { 6516 unsigned Opc; 6517 if (VT == MVT::v8i16) 6518 Opc = X86ISD::PINSRW; 6519 else if (VT == MVT::v16i8) 6520 Opc = X86ISD::PINSRB; 6521 else 6522 Opc = X86ISD::PINSRB; 6523 6524 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 6525 // argument. 
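    // For example, (v8i16 (insert_vector_elt V, x, 3)) becomes
    // (X86ISD::PINSRW V, (any_extend x to i32), 3), which isel matches as
    // something like "pinsrw $3, %eax, %xmm0"; only the low 16 bits of the
    // GR32 source are actually read.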
6526 if (N1.getValueType() != MVT::i32) 6527 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 6528 if (N2.getValueType() != MVT::i32) 6529 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 6530 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 6531 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 6532 // Bits [7:6] of the constant are the source select. This will always be 6533 // zero here. The DAG Combiner may combine an extract_elt index into these 6534 // bits. For example (insert (extract, 3), 2) could be matched by putting 6535 // the '3' into bits [7:6] of X86ISD::INSERTPS. 6536 // Bits [5:4] of the constant are the destination select. This is the 6537 // value of the incoming immediate. 6538 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 6539 // combine either bitwise AND or insert of float 0.0 to set these bits. 6540 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 6541 // Create this as a scalar to vector.. 6542 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 6543 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 6544 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 6545 // PINSR* works with constant index. 6546 return Op; 6547 } 6548 return SDValue(); 6549} 6550 6551SDValue 6552X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 6553 EVT VT = Op.getValueType(); 6554 EVT EltVT = VT.getVectorElementType(); 6555 6556 DebugLoc dl = Op.getDebugLoc(); 6557 SDValue N0 = Op.getOperand(0); 6558 SDValue N1 = Op.getOperand(1); 6559 SDValue N2 = Op.getOperand(2); 6560 6561 // If this is a 256-bit vector result, first extract the 128-bit vector, 6562 // insert the element into the extracted half and then place it back. 6563 if (VT.getSizeInBits() == 256) { 6564 if (!isa<ConstantSDNode>(N2)) 6565 return SDValue(); 6566 6567 // Get the desired 128-bit vector half. 6568 unsigned NumElems = VT.getVectorNumElements(); 6569 unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); 6570 bool Upper = IdxVal >= NumElems/2; 6571 SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32); 6572 SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl); 6573 6574 // Insert the element into the desired half. 6575 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, 6576 N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2); 6577 6578 // Insert the changed part back to the 256-bit vector 6579 return Insert128BitVector(N0, V, Ins128Idx, DAG, dl); 6580 } 6581 6582 if (Subtarget->hasSSE41() || Subtarget->hasAVX()) 6583 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 6584 6585 if (EltVT == MVT::i8) 6586 return SDValue(); 6587 6588 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 6589 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 6590 // as its second argument. 
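  // Note that pinsrb requires SSE4.1 (covered by the SSE4 path above); in
  // this pre-SSE4.1 path the i8 element case has already bailed out and is
  // left to the default expansion, so only pinsrw for i16 elements is
  // emitted directly here.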
6591 if (N1.getValueType() != MVT::i32) 6592 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 6593 if (N2.getValueType() != MVT::i32) 6594 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 6595 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 6596 } 6597 return SDValue(); 6598} 6599 6600SDValue 6601X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { 6602 LLVMContext *Context = DAG.getContext(); 6603 DebugLoc dl = Op.getDebugLoc(); 6604 EVT OpVT = Op.getValueType(); 6605 6606 // If this is a 256-bit vector result, first insert into a 128-bit 6607 // vector and then insert into the 256-bit vector. 6608 if (OpVT.getSizeInBits() > 128) { 6609 // Insert into a 128-bit vector. 6610 EVT VT128 = EVT::getVectorVT(*Context, 6611 OpVT.getVectorElementType(), 6612 OpVT.getVectorNumElements() / 2); 6613 6614 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); 6615 6616 // Insert the 128-bit vector. 6617 return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op, 6618 DAG.getConstant(0, MVT::i32), 6619 DAG, dl); 6620 } 6621 6622 if (Op.getValueType() == MVT::v1i64 && 6623 Op.getOperand(0).getValueType() == MVT::i64) 6624 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 6625 6626 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 6627 assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 && 6628 "Expected an SSE type!"); 6629 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), 6630 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); 6631} 6632 6633// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in 6634// a simple subregister reference or explicit instructions to grab 6635// upper bits of a vector. 6636SDValue 6637X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 6638 if (Subtarget->hasAVX()) { 6639 DebugLoc dl = Op.getNode()->getDebugLoc(); 6640 SDValue Vec = Op.getNode()->getOperand(0); 6641 SDValue Idx = Op.getNode()->getOperand(1); 6642 6643 if (Op.getNode()->getValueType(0).getSizeInBits() == 128 6644 && Vec.getNode()->getValueType(0).getSizeInBits() == 256) { 6645 return Extract128BitVector(Vec, Idx, DAG, dl); 6646 } 6647 } 6648 return SDValue(); 6649} 6650 6651// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a 6652// simple superregister reference or explicit instructions to insert 6653// the upper bits of a vector. 6654SDValue 6655X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 6656 if (Subtarget->hasAVX()) { 6657 DebugLoc dl = Op.getNode()->getDebugLoc(); 6658 SDValue Vec = Op.getNode()->getOperand(0); 6659 SDValue SubVec = Op.getNode()->getOperand(1); 6660 SDValue Idx = Op.getNode()->getOperand(2); 6661 6662 if (Op.getNode()->getValueType(0).getSizeInBits() == 256 6663 && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) { 6664 return Insert128BitVector(Vec, SubVec, Idx, DAG, dl); 6665 } 6666 } 6667 return SDValue(); 6668} 6669 6670// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 6671// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 6672// one of the above mentioned nodes. It has to be wrapped because otherwise 6673// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 6674// be used to form addressing mode. These wrapped nodes will be selected 6675// into MOV32ri. 
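// For example, in static code a constant-pool reference is emitted as
//   (X86ISD::Wrapper (TargetConstantPool ...))
// while 32-bit GOT-style PIC code emits
//   (add GlobalBaseReg, (X86ISD::Wrapper TargetConstantPool@GOTOFF))
// and small/kernel code-model x86-64 uses X86ISD::WrapperRIP for a
// RIP-relative reference, as done in the lowering routines below.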
6676SDValue 6677X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 6678 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 6679 6680 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6681 // global base reg. 6682 unsigned char OpFlag = 0; 6683 unsigned WrapperKind = X86ISD::Wrapper; 6684 CodeModel::Model M = getTargetMachine().getCodeModel(); 6685 6686 if (Subtarget->isPICStyleRIPRel() && 6687 (M == CodeModel::Small || M == CodeModel::Kernel)) 6688 WrapperKind = X86ISD::WrapperRIP; 6689 else if (Subtarget->isPICStyleGOT()) 6690 OpFlag = X86II::MO_GOTOFF; 6691 else if (Subtarget->isPICStyleStubPIC()) 6692 OpFlag = X86II::MO_PIC_BASE_OFFSET; 6693 6694 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 6695 CP->getAlignment(), 6696 CP->getOffset(), OpFlag); 6697 DebugLoc DL = CP->getDebugLoc(); 6698 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6699 // With PIC, the address is actually $g + Offset. 6700 if (OpFlag) { 6701 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6702 DAG.getNode(X86ISD::GlobalBaseReg, 6703 DebugLoc(), getPointerTy()), 6704 Result); 6705 } 6706 6707 return Result; 6708} 6709 6710SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 6711 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 6712 6713 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6714 // global base reg. 6715 unsigned char OpFlag = 0; 6716 unsigned WrapperKind = X86ISD::Wrapper; 6717 CodeModel::Model M = getTargetMachine().getCodeModel(); 6718 6719 if (Subtarget->isPICStyleRIPRel() && 6720 (M == CodeModel::Small || M == CodeModel::Kernel)) 6721 WrapperKind = X86ISD::WrapperRIP; 6722 else if (Subtarget->isPICStyleGOT()) 6723 OpFlag = X86II::MO_GOTOFF; 6724 else if (Subtarget->isPICStyleStubPIC()) 6725 OpFlag = X86II::MO_PIC_BASE_OFFSET; 6726 6727 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 6728 OpFlag); 6729 DebugLoc DL = JT->getDebugLoc(); 6730 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6731 6732 // With PIC, the address is actually $g + Offset. 6733 if (OpFlag) 6734 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6735 DAG.getNode(X86ISD::GlobalBaseReg, 6736 DebugLoc(), getPointerTy()), 6737 Result); 6738 6739 return Result; 6740} 6741 6742SDValue 6743X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 6744 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 6745 6746 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6747 // global base reg. 6748 unsigned char OpFlag = 0; 6749 unsigned WrapperKind = X86ISD::Wrapper; 6750 CodeModel::Model M = getTargetMachine().getCodeModel(); 6751 6752 if (Subtarget->isPICStyleRIPRel() && 6753 (M == CodeModel::Small || M == CodeModel::Kernel)) 6754 WrapperKind = X86ISD::WrapperRIP; 6755 else if (Subtarget->isPICStyleGOT()) 6756 OpFlag = X86II::MO_GOTOFF; 6757 else if (Subtarget->isPICStyleStubPIC()) 6758 OpFlag = X86II::MO_PIC_BASE_OFFSET; 6759 6760 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 6761 6762 DebugLoc DL = Op.getDebugLoc(); 6763 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6764 6765 6766 // With PIC, the address is actually $g + Offset. 
6767 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 6768 !Subtarget->is64Bit()) { 6769 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6770 DAG.getNode(X86ISD::GlobalBaseReg, 6771 DebugLoc(), getPointerTy()), 6772 Result); 6773 } 6774 6775 return Result; 6776} 6777 6778SDValue 6779X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 6780 // Create the TargetBlockAddressAddress node. 6781 unsigned char OpFlags = 6782 Subtarget->ClassifyBlockAddressReference(); 6783 CodeModel::Model M = getTargetMachine().getCodeModel(); 6784 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 6785 DebugLoc dl = Op.getDebugLoc(); 6786 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 6787 /*isTarget=*/true, OpFlags); 6788 6789 if (Subtarget->isPICStyleRIPRel() && 6790 (M == CodeModel::Small || M == CodeModel::Kernel)) 6791 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 6792 else 6793 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 6794 6795 // With PIC, the address is actually $g + Offset. 6796 if (isGlobalRelativeToPICBase(OpFlags)) { 6797 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6798 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 6799 Result); 6800 } 6801 6802 return Result; 6803} 6804 6805SDValue 6806X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 6807 int64_t Offset, 6808 SelectionDAG &DAG) const { 6809 // Create the TargetGlobalAddress node, folding in the constant 6810 // offset if it is legal. 6811 unsigned char OpFlags = 6812 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 6813 CodeModel::Model M = getTargetMachine().getCodeModel(); 6814 SDValue Result; 6815 if (OpFlags == X86II::MO_NO_FLAG && 6816 X86::isOffsetSuitableForCodeModel(Offset, M)) { 6817 // A direct static reference to a global. 6818 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 6819 Offset = 0; 6820 } else { 6821 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 6822 } 6823 6824 if (Subtarget->isPICStyleRIPRel() && 6825 (M == CodeModel::Small || M == CodeModel::Kernel)) 6826 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 6827 else 6828 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 6829 6830 // With PIC, the address is actually $g + Offset. 6831 if (isGlobalRelativeToPICBase(OpFlags)) { 6832 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6833 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 6834 Result); 6835 } 6836 6837 // For globals that require a load from a stub to get the address, emit the 6838 // load. 6839 if (isGlobalStubReference(OpFlags)) 6840 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 6841 MachinePointerInfo::getGOT(), false, false, 0); 6842 6843 // If there was a non-zero offset that we didn't fold, create an explicit 6844 // addition for it. 
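  // For example, a hypothetical reference to "gv + 8" that must go through an
  // indirection stub is emitted as (add (load (Wrapper gv@stub)), 8) rather
  // than folding the +8 into the stub's address.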
6845 if (Offset != 0) 6846 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 6847 DAG.getConstant(Offset, getPointerTy())); 6848 6849 return Result; 6850} 6851 6852SDValue 6853X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 6854 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 6855 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 6856 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 6857} 6858 6859static SDValue 6860GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 6861 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 6862 unsigned char OperandFlags) { 6863 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6864 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 6865 DebugLoc dl = GA->getDebugLoc(); 6866 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 6867 GA->getValueType(0), 6868 GA->getOffset(), 6869 OperandFlags); 6870 if (InFlag) { 6871 SDValue Ops[] = { Chain, TGA, *InFlag }; 6872 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 6873 } else { 6874 SDValue Ops[] = { Chain, TGA }; 6875 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 6876 } 6877 6878 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 6879 MFI->setAdjustsStack(true); 6880 6881 SDValue Flag = Chain.getValue(1); 6882 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 6883} 6884 6885// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 6886static SDValue 6887LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6888 const EVT PtrVT) { 6889 SDValue InFlag; 6890 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 6891 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 6892 DAG.getNode(X86ISD::GlobalBaseReg, 6893 DebugLoc(), PtrVT), InFlag); 6894 InFlag = Chain.getValue(1); 6895 6896 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 6897} 6898 6899// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 6900static SDValue 6901LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6902 const EVT PtrVT) { 6903 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 6904 X86::RAX, X86II::MO_TLSGD); 6905} 6906 6907// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 6908// "local exec" model. 6909static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6910 const EVT PtrVT, TLSModel::Model model, 6911 bool is64Bit) { 6912 DebugLoc dl = GA->getDebugLoc(); 6913 6914 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 6915 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 6916 is64Bit ? 257 : 256)); 6917 6918 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 6919 DAG.getIntPtrConstant(0), 6920 MachinePointerInfo(Ptr), false, false, 0); 6921 6922 unsigned char OperandFlags = 0; 6923 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 6924 // initialexec. 6925 unsigned WrapperKind = X86ISD::Wrapper; 6926 if (model == TLSModel::LocalExec) { 6927 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 6928 } else if (is64Bit) { 6929 assert(model == TLSModel::InitialExec); 6930 OperandFlags = X86II::MO_GOTTPOFF; 6931 WrapperKind = X86ISD::WrapperRIP; 6932 } else { 6933 assert(model == TLSModel::InitialExec); 6934 OperandFlags = X86II::MO_INDNTPOFF; 6935 } 6936 6937 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 6938 // exec) 6939 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 6940 GA->getValueType(0), 6941 GA->getOffset(), OperandFlags); 6942 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 6943 6944 if (model == TLSModel::InitialExec) 6945 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 6946 MachinePointerInfo::getGOT(), false, false, 0); 6947 6948 // The address of the thread local variable is the add of the thread 6949 // pointer with the offset of the variable. 6950 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 6951} 6952 6953SDValue 6954X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 6955 6956 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 6957 const GlobalValue *GV = GA->getGlobal(); 6958 6959 if (Subtarget->isTargetELF()) { 6960 // TODO: implement the "local dynamic" model 6961 // TODO: implement the "initial exec"model for pic executables 6962 6963 // If GV is an alias then use the aliasee for determining 6964 // thread-localness. 6965 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 6966 GV = GA->resolveAliasedGlobal(false); 6967 6968 TLSModel::Model model 6969 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 6970 6971 switch (model) { 6972 case TLSModel::GeneralDynamic: 6973 case TLSModel::LocalDynamic: // not implemented 6974 if (Subtarget->is64Bit()) 6975 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 6976 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 6977 6978 case TLSModel::InitialExec: 6979 case TLSModel::LocalExec: 6980 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 6981 Subtarget->is64Bit()); 6982 } 6983 } else if (Subtarget->isTargetDarwin()) { 6984 // Darwin only has one model of TLS. Lower to that. 6985 unsigned char OpFlag = 0; 6986 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 6987 X86ISD::WrapperRIP : X86ISD::Wrapper; 6988 6989 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6990 // global base reg. 6991 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 6992 !Subtarget->is64Bit(); 6993 if (PIC32) 6994 OpFlag = X86II::MO_TLVP_PIC_BASE; 6995 else 6996 OpFlag = X86II::MO_TLVP; 6997 DebugLoc DL = Op.getDebugLoc(); 6998 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 6999 GA->getValueType(0), 7000 GA->getOffset(), OpFlag); 7001 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7002 7003 // With PIC32, the address is actually $g + Offset. 7004 if (PIC32) 7005 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7006 DAG.getNode(X86ISD::GlobalBaseReg, 7007 DebugLoc(), getPointerTy()), 7008 Offset); 7009 7010 // Lowering the machine isd will make sure everything is in the right 7011 // location. 7012 SDValue Chain = DAG.getEntryNode(); 7013 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 7014 SDValue Args[] = { Chain, Offset }; 7015 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); 7016 7017 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 
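    // Roughly, the resulting code loads the address of the variable's TLV
    // descriptor (x@TLVP) into a register and calls the handler function
    // stored at the start of that descriptor; the variable's address comes
    // back in the normal integer return register, which is why the
    // CopyFromReg below reads EAX/RAX.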
7018 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7019 MFI->setAdjustsStack(true); 7020 7021 // And our return value (tls address) is in the standard call return value 7022 // location. 7023 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 7024 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); 7025 } 7026 7027 assert(false && 7028 "TLS not implemented for this target."); 7029 7030 llvm_unreachable("Unreachable"); 7031 return SDValue(); 7032} 7033 7034 7035/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and 7036/// take a 2 x i32 value to shift plus a shift amount. 7037SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const { 7038 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 7039 EVT VT = Op.getValueType(); 7040 unsigned VTBits = VT.getSizeInBits(); 7041 DebugLoc dl = Op.getDebugLoc(); 7042 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 7043 SDValue ShOpLo = Op.getOperand(0); 7044 SDValue ShOpHi = Op.getOperand(1); 7045 SDValue ShAmt = Op.getOperand(2); 7046 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 7047 DAG.getConstant(VTBits - 1, MVT::i8)) 7048 : DAG.getConstant(0, VT); 7049 7050 SDValue Tmp2, Tmp3; 7051 if (Op.getOpcode() == ISD::SHL_PARTS) { 7052 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 7053 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 7054 } else { 7055 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 7056 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 7057 } 7058 7059 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 7060 DAG.getConstant(VTBits, MVT::i8)); 7061 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 7062 AndNode, DAG.getConstant(0, MVT::i8)); 7063 7064 SDValue Hi, Lo; 7065 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7066 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 7067 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 7068 7069 if (Op.getOpcode() == ISD::SHL_PARTS) { 7070 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 7071 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 7072 } else { 7073 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 7074 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 7075 } 7076 7077 SDValue Ops[2] = { Lo, Hi }; 7078 return DAG.getMergeValues(Ops, 2, dl); 7079} 7080 7081SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 7082 SelectionDAG &DAG) const { 7083 EVT SrcVT = Op.getOperand(0).getValueType(); 7084 EVT DstVT = Op.getValueType(); 7085 DebugLoc dl = Op.getDebugLoc(); 7086 7087 if (SrcVT.isVector() && DstVT.isVector()) { 7088 unsigned SrcVTSize = SrcVT.getSizeInBits(); 7089 unsigned DstVTSize = DstVT.getSizeInBits(); 7090 7091 // Support directly by the target 7092 if (SrcVTSize == DstVTSize) 7093 return Op; 7094 7095 // Handle v4f64 = sitofp v4i32 7096 if (DstVT != MVT::v4f64 && SrcVT != MVT::v4i32) 7097 return SDValue(); 7098 7099 SDValue V = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Op.getOperand(0)); 7100 return DAG.getNode(ISD::FP_EXTEND, dl, DstVT, V); 7101 } 7102 7103 if (SrcVT.isVector()) 7104 return SDValue(); 7105 7106 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 7107 "Unknown SINT_TO_FP to lower!"); 7108 7109 // These are really Legal; return the operand so the caller accepts it as 7110 // Legal. 
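  // For example, (f64 (sint_to_fp i32)) with SSE2 selects directly to
  // cvtsi2sd, and the i64 source form selects to the 64-bit cvtsi2sd in
  // 64-bit mode, so no custom expansion is needed for those cases.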
7111 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 7112 return Op; 7113 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 7114 Subtarget->is64Bit()) { 7115 return Op; 7116 } 7117 7118 unsigned Size = SrcVT.getSizeInBits()/8; 7119 MachineFunction &MF = DAG.getMachineFunction(); 7120 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 7121 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7122 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7123 StackSlot, 7124 MachinePointerInfo::getFixedStack(SSFI), 7125 false, false, 0); 7126 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 7127} 7128 7129SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 7130 SDValue StackSlot, 7131 SelectionDAG &DAG) const { 7132 // Build the FILD 7133 DebugLoc DL = Op.getDebugLoc(); 7134 SDVTList Tys; 7135 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 7136 if (useSSE) 7137 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 7138 else 7139 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 7140 7141 unsigned ByteSize = SrcVT.getSizeInBits()/8; 7142 7143 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); 7144 MachineMemOperand *MMO; 7145 if (FI) { 7146 int SSFI = FI->getIndex(); 7147 MMO = 7148 DAG.getMachineFunction() 7149 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7150 MachineMemOperand::MOLoad, ByteSize, ByteSize); 7151 } else { 7152 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); 7153 StackSlot = StackSlot.getOperand(1); 7154 } 7155 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 7156 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 7157 X86ISD::FILD, DL, 7158 Tys, Ops, array_lengthof(Ops), 7159 SrcVT, MMO); 7160 7161 if (useSSE) { 7162 Chain = Result.getValue(1); 7163 SDValue InFlag = Result.getValue(2); 7164 7165 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 7166 // shouldn't be necessary except that RFP cannot be live across 7167 // multiple blocks. When stackifier is fixed, they can be uncoupled. 7168 MachineFunction &MF = DAG.getMachineFunction(); 7169 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 7170 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 7171 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7172 Tys = DAG.getVTList(MVT::Other); 7173 SDValue Ops[] = { 7174 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 7175 }; 7176 MachineMemOperand *MMO = 7177 DAG.getMachineFunction() 7178 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7179 MachineMemOperand::MOStore, SSFISize, SSFISize); 7180 7181 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 7182 Ops, array_lengthof(Ops), 7183 Op.getValueType(), MMO); 7184 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 7185 MachinePointerInfo::getFixedStack(SSFI), 7186 false, false, 0); 7187 } 7188 7189 return Result; 7190} 7191 7192// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 7193SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 7194 SelectionDAG &DAG) const { 7195 // This algorithm is not obvious. Here it is in C code, more or less: 7196 /* 7197 double uint64_to_double( uint32_t hi, uint32_t lo ) { 7198 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 7199 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 7200 7201 // Copy ints to xmm registers. 
7202 __m128i xh = _mm_cvtsi32_si128( hi ); 7203 __m128i xl = _mm_cvtsi32_si128( lo ); 7204 7205 // Combine into low half of a single xmm register. 7206 __m128i x = _mm_unpacklo_epi32( xh, xl ); 7207 __m128d d; 7208 double sd; 7209 7210 // Merge in appropriate exponents to give the integer bits the right 7211 // magnitude. 7212 x = _mm_unpacklo_epi32( x, exp ); 7213 7214 // Subtract away the biases to deal with the IEEE-754 double precision 7215 // implicit 1. 7216 d = _mm_sub_pd( (__m128d) x, bias ); 7217 7218 // All conversions up to here are exact. The correctly rounded result is 7219 // calculated using the current rounding mode using the following 7220 // horizontal add. 7221 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 7222 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 7223 // store doesn't really need to be here (except 7224 // maybe to zero the other double) 7225 return sd; 7226 } 7227 */ 7228 7229 DebugLoc dl = Op.getDebugLoc(); 7230 LLVMContext *Context = DAG.getContext(); 7231 7232 // Build some magic constants. 7233 std::vector<Constant*> CV0; 7234 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 7235 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 7236 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 7237 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 7238 Constant *C0 = ConstantVector::get(CV0); 7239 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 7240 7241 std::vector<Constant*> CV1; 7242 CV1.push_back( 7243 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 7244 CV1.push_back( 7245 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 7246 Constant *C1 = ConstantVector::get(CV1); 7247 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 7248 7249 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7250 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7251 Op.getOperand(0), 7252 DAG.getIntPtrConstant(1))); 7253 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7254 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7255 Op.getOperand(0), 7256 DAG.getIntPtrConstant(0))); 7257 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 7258 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 7259 MachinePointerInfo::getConstantPool(), 7260 false, false, 16); 7261 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 7262 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); 7263 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 7264 MachinePointerInfo::getConstantPool(), 7265 false, false, 16); 7266 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 7267 7268 // Add the halves; easiest way is to swap them into another reg first. 7269 int ShufMask[2] = { 1, -1 }; 7270 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 7271 DAG.getUNDEF(MVT::v2f64), ShufMask); 7272 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 7273 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 7274 DAG.getIntPtrConstant(0)); 7275} 7276 7277// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 7278SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 7279 SelectionDAG &DAG) const { 7280 DebugLoc dl = Op.getDebugLoc(); 7281 // FP constant to bias correct the final result. 
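  // The usual bit trick: OR the 32-bit value into the low mantissa bits of
  // the double with bit pattern 0x4330000000000000 (i.e. 2^52).  The result
  // is exactly 2^52 + x, so subtracting the 2^52 bias below recovers x
  // exactly; any rounding happens only in the final FP_ROUND when the
  // destination type is f32.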
7282 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 7283 MVT::f64); 7284 7285 // Load the 32-bit value into an XMM register. 7286 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7287 Op.getOperand(0)); 7288 7289 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 7290 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), 7291 DAG.getIntPtrConstant(0)); 7292 7293 // Or the load with the bias. 7294 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 7295 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7296 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 7297 MVT::v2f64, Load)), 7298 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7299 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 7300 MVT::v2f64, Bias))); 7301 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 7302 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), 7303 DAG.getIntPtrConstant(0)); 7304 7305 // Subtract the bias. 7306 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 7307 7308 // Handle final rounding. 7309 EVT DestVT = Op.getValueType(); 7310 7311 if (DestVT.bitsLT(MVT::f64)) { 7312 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 7313 DAG.getIntPtrConstant(0)); 7314 } else if (DestVT.bitsGT(MVT::f64)) { 7315 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 7316 } 7317 7318 // Handle final rounding. 7319 return Sub; 7320} 7321 7322SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 7323 SelectionDAG &DAG) const { 7324 SDValue N0 = Op.getOperand(0); 7325 DebugLoc dl = Op.getDebugLoc(); 7326 7327 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 7328 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 7329 // the optimization here. 7330 if (DAG.SignBitIsZero(N0)) 7331 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 7332 7333 EVT SrcVT = N0.getValueType(); 7334 EVT DstVT = Op.getValueType(); 7335 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 7336 return LowerUINT_TO_FP_i64(Op, DAG); 7337 else if (SrcVT == MVT::i32 && X86ScalarSSEf64) 7338 return LowerUINT_TO_FP_i32(Op, DAG); 7339 7340 // Make a 64-bit buffer, and use it to build an FILD. 7341 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 7342 if (SrcVT == MVT::i32) { 7343 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 7344 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 7345 getPointerTy(), StackSlot, WordOff); 7346 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7347 StackSlot, MachinePointerInfo(), 7348 false, false, 0); 7349 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 7350 OffsetSlot, MachinePointerInfo(), 7351 false, false, 0); 7352 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 7353 return Fild; 7354 } 7355 7356 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 7357 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7358 StackSlot, MachinePointerInfo(), 7359 false, false, 0); 7360 // For i64 source, we need to add the appropriate power of 2 if the input 7361 // was negative. This is the same as the optimization in 7362 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, 7363 // we must be careful to do the computation in x87 extended precision, not 7364 // in SSE. (The generic code can't know it's OK to do this, or how to.) 
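  // Concretely: fild interprets a value with the sign bit set as x - 2^64,
  // so for such inputs we add back 2^64 (the f32 constant 0x5F800000 built
  // into the constant pool below); doing the add in f80, whose significand
  // has 64 bits, keeps the correction exact.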
7365 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 7366 MachineMemOperand *MMO = 7367 DAG.getMachineFunction() 7368 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7369 MachineMemOperand::MOLoad, 8, 8); 7370 7371 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 7372 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 7373 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3, 7374 MVT::i64, MMO); 7375 7376 APInt FF(32, 0x5F800000ULL); 7377 7378 // Check whether the sign bit is set. 7379 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 7380 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 7381 ISD::SETLT); 7382 7383 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 7384 SDValue FudgePtr = DAG.getConstantPool( 7385 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 7386 getPointerTy()); 7387 7388 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 7389 SDValue Zero = DAG.getIntPtrConstant(0); 7390 SDValue Four = DAG.getIntPtrConstant(4); 7391 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 7392 Zero, Four); 7393 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 7394 7395 // Load the value out, extending it from f32 to f80. 7396 // FIXME: Avoid the extend by constructing the right constant pool? 7397 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), 7398 FudgePtr, MachinePointerInfo::getConstantPool(), 7399 MVT::f32, false, false, 4); 7400 // Extend everything to 80 bits to force it to be done on x87. 7401 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 7402 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 7403} 7404 7405std::pair<SDValue,SDValue> X86TargetLowering:: 7406FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 7407 DebugLoc DL = Op.getDebugLoc(); 7408 7409 EVT DstTy = Op.getValueType(); 7410 7411 if (!IsSigned) { 7412 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 7413 DstTy = MVT::i64; 7414 } 7415 7416 assert(DstTy.getSimpleVT() <= MVT::i64 && 7417 DstTy.getSimpleVT() >= MVT::i16 && 7418 "Unknown FP_TO_SINT to lower!"); 7419 7420 // These are really Legal. 7421 if (DstTy == MVT::i32 && 7422 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 7423 return std::make_pair(SDValue(), SDValue()); 7424 if (Subtarget->is64Bit() && 7425 DstTy == MVT::i64 && 7426 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 7427 return std::make_pair(SDValue(), SDValue()); 7428 7429 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 7430 // stack slot. 
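  // Roughly: if the source lives in an SSE register it is first stored to a
  // slot and reloaded onto the x87 stack with FLD, then the
  // FP_TO_INT*_IN_MEM pseudo performs the fistp (switching the x87 rounding
  // mode to truncation around it), and the caller loads the integer result
  // from the returned stack slot.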
7431 MachineFunction &MF = DAG.getMachineFunction(); 7432 unsigned MemSize = DstTy.getSizeInBits()/8; 7433 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 7434 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7435 7436 7437 7438 unsigned Opc; 7439 switch (DstTy.getSimpleVT().SimpleTy) { 7440 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 7441 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 7442 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 7443 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 7444 } 7445 7446 SDValue Chain = DAG.getEntryNode(); 7447 SDValue Value = Op.getOperand(0); 7448 EVT TheVT = Op.getOperand(0).getValueType(); 7449 if (isScalarFPTypeInSSEReg(TheVT)) { 7450 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 7451 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 7452 MachinePointerInfo::getFixedStack(SSFI), 7453 false, false, 0); 7454 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 7455 SDValue Ops[] = { 7456 Chain, StackSlot, DAG.getValueType(TheVT) 7457 }; 7458 7459 MachineMemOperand *MMO = 7460 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7461 MachineMemOperand::MOLoad, MemSize, MemSize); 7462 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, 7463 DstTy, MMO); 7464 Chain = Value.getValue(1); 7465 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 7466 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7467 } 7468 7469 MachineMemOperand *MMO = 7470 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7471 MachineMemOperand::MOStore, MemSize, MemSize); 7472 7473 // Build the FP_TO_INT*_IN_MEM 7474 SDValue Ops[] = { Chain, Value, StackSlot }; 7475 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 7476 Ops, 3, DstTy, MMO); 7477 7478 return std::make_pair(FIST, StackSlot); 7479} 7480 7481SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 7482 SelectionDAG &DAG) const { 7483 if (Op.getValueType().isVector()) 7484 return SDValue(); 7485 7486 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 7487 SDValue FIST = Vals.first, StackSlot = Vals.second; 7488 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 7489 if (FIST.getNode() == 0) return Op; 7490 7491 // Load the result. 7492 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 7493 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 7494} 7495 7496SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 7497 SelectionDAG &DAG) const { 7498 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 7499 SDValue FIST = Vals.first, StackSlot = Vals.second; 7500 assert(FIST.getNode() && "Unexpected failure"); 7501 7502 // Load the result. 
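  // The unsigned case is handled by widening: the helper converted to a
  // signed i64 in memory, and loading only the low 32 bits here yields the
  // correct unsigned i32 for every input that fits in 32 bits.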
7503 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 7504 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 7505} 7506 7507SDValue X86TargetLowering::LowerFABS(SDValue Op, 7508 SelectionDAG &DAG) const { 7509 LLVMContext *Context = DAG.getContext(); 7510 DebugLoc dl = Op.getDebugLoc(); 7511 EVT VT = Op.getValueType(); 7512 EVT EltVT = VT; 7513 if (VT.isVector()) 7514 EltVT = VT.getVectorElementType(); 7515 std::vector<Constant*> CV; 7516 if (EltVT == MVT::f64) { 7517 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 7518 CV.push_back(C); 7519 CV.push_back(C); 7520 } else { 7521 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 7522 CV.push_back(C); 7523 CV.push_back(C); 7524 CV.push_back(C); 7525 CV.push_back(C); 7526 } 7527 Constant *C = ConstantVector::get(CV); 7528 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7529 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7530 MachinePointerInfo::getConstantPool(), 7531 false, false, 16); 7532 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 7533} 7534 7535SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 7536 LLVMContext *Context = DAG.getContext(); 7537 DebugLoc dl = Op.getDebugLoc(); 7538 EVT VT = Op.getValueType(); 7539 EVT EltVT = VT; 7540 if (VT.isVector()) 7541 EltVT = VT.getVectorElementType(); 7542 std::vector<Constant*> CV; 7543 if (EltVT == MVT::f64) { 7544 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 7545 CV.push_back(C); 7546 CV.push_back(C); 7547 } else { 7548 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 7549 CV.push_back(C); 7550 CV.push_back(C); 7551 CV.push_back(C); 7552 CV.push_back(C); 7553 } 7554 Constant *C = ConstantVector::get(CV); 7555 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7556 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7557 MachinePointerInfo::getConstantPool(), 7558 false, false, 16); 7559 if (VT.isVector()) { 7560 return DAG.getNode(ISD::BITCAST, dl, VT, 7561 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 7562 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7563 Op.getOperand(0)), 7564 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); 7565 } else { 7566 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 7567 } 7568} 7569 7570SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 7571 LLVMContext *Context = DAG.getContext(); 7572 SDValue Op0 = Op.getOperand(0); 7573 SDValue Op1 = Op.getOperand(1); 7574 DebugLoc dl = Op.getDebugLoc(); 7575 EVT VT = Op.getValueType(); 7576 EVT SrcVT = Op1.getValueType(); 7577 7578 // If second operand is smaller, extend it first. 7579 if (SrcVT.bitsLT(VT)) { 7580 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 7581 SrcVT = VT; 7582 } 7583 // And if it is bigger, shrink it first. 7584 if (SrcVT.bitsGT(VT)) { 7585 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 7586 SrcVT = VT; 7587 } 7588 7589 // At this point the operands and the result should have the same 7590 // type, and that won't be f80 since that is not custom lowered. 7591 7592 // First get the sign bit of second operand. 
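  // The constant-pool mask built below keeps only the sign bit of Op1:
  // lane 0 is 0x8000000000000000 (f64) or 0x80000000 (f32) and the remaining
  // lanes are zero, so the FAND leaves just the sign of the second operand.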
7593 std::vector<Constant*> CV; 7594 if (SrcVT == MVT::f64) { 7595 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 7596 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 7597 } else { 7598 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 7599 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7600 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7601 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7602 } 7603 Constant *C = ConstantVector::get(CV); 7604 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7605 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 7606 MachinePointerInfo::getConstantPool(), 7607 false, false, 16); 7608 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 7609 7610 // Shift sign bit right or left if the two operands have different types. 7611 if (SrcVT.bitsGT(VT)) { 7612 // Op0 is MVT::f32, Op1 is MVT::f64. 7613 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 7614 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 7615 DAG.getConstant(32, MVT::i32)); 7616 SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); 7617 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 7618 DAG.getIntPtrConstant(0)); 7619 } 7620 7621 // Clear first operand sign bit. 7622 CV.clear(); 7623 if (VT == MVT::f64) { 7624 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 7625 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 7626 } else { 7627 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 7628 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7629 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7630 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7631 } 7632 C = ConstantVector::get(CV); 7633 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7634 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7635 MachinePointerInfo::getConstantPool(), 7636 false, false, 16); 7637 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 7638 7639 // Or the value with the sign bit. 7640 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 7641} 7642 7643SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { 7644 SDValue N0 = Op.getOperand(0); 7645 DebugLoc dl = Op.getDebugLoc(); 7646 EVT VT = Op.getValueType(); 7647 7648 // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). 7649 SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, 7650 DAG.getConstant(1, VT)); 7651 return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); 7652} 7653 7654/// Emit nodes that will be selected as "test Op0,Op0", or something 7655/// equivalent. 7656SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 7657 SelectionDAG &DAG) const { 7658 DebugLoc dl = Op.getDebugLoc(); 7659 7660 // CF and OF aren't always set the way we want. Determine which 7661 // of these we need. 
7662 bool NeedCF = false; 7663 bool NeedOF = false; 7664 switch (X86CC) { 7665 default: break; 7666 case X86::COND_A: case X86::COND_AE: 7667 case X86::COND_B: case X86::COND_BE: 7668 NeedCF = true; 7669 break; 7670 case X86::COND_G: case X86::COND_GE: 7671 case X86::COND_L: case X86::COND_LE: 7672 case X86::COND_O: case X86::COND_NO: 7673 NeedOF = true; 7674 break; 7675 } 7676 7677 // See if we can use the EFLAGS value from the operand instead of 7678 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 7679 // we prove that the arithmetic won't overflow, we can't use OF or CF. 7680 if (Op.getResNo() != 0 || NeedOF || NeedCF) 7681 // Emit a CMP with 0, which is the TEST pattern. 7682 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 7683 DAG.getConstant(0, Op.getValueType())); 7684 7685 unsigned Opcode = 0; 7686 unsigned NumOperands = 0; 7687 switch (Op.getNode()->getOpcode()) { 7688 case ISD::ADD: 7689 // Due to an isel shortcoming, be conservative if this add is likely to be 7690 // selected as part of a load-modify-store instruction. When the root node 7691 // in a match is a store, isel doesn't know how to remap non-chain non-flag 7692 // uses of other nodes in the match, such as the ADD in this case. This 7693 // leads to the ADD being left around and reselected, with the result being 7694 // two adds in the output. Alas, even if none our users are stores, that 7695 // doesn't prove we're O.K. Ergo, if we have any parents that aren't 7696 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require 7697 // climbing the DAG back to the root, and it doesn't seem to be worth the 7698 // effort. 7699 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 7700 UE = Op.getNode()->use_end(); UI != UE; ++UI) 7701 if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) 7702 goto default_case; 7703 7704 if (ConstantSDNode *C = 7705 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 7706 // An add of one will be selected as an INC. 7707 if (C->getAPIntValue() == 1) { 7708 Opcode = X86ISD::INC; 7709 NumOperands = 1; 7710 break; 7711 } 7712 7713 // An add of negative one (subtract of one) will be selected as a DEC. 7714 if (C->getAPIntValue().isAllOnesValue()) { 7715 Opcode = X86ISD::DEC; 7716 NumOperands = 1; 7717 break; 7718 } 7719 } 7720 7721 // Otherwise use a regular EFLAGS-setting add. 7722 Opcode = X86ISD::ADD; 7723 NumOperands = 2; 7724 break; 7725 case ISD::AND: { 7726 // If the primary and result isn't used, don't bother using X86ISD::AND, 7727 // because a TEST instruction will be better. 7728 bool NonFlagUse = false; 7729 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 7730 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 7731 SDNode *User = *UI; 7732 unsigned UOpNo = UI.getOperandNo(); 7733 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { 7734 // Look pass truncate. 7735 UOpNo = User->use_begin().getOperandNo(); 7736 User = *User->use_begin(); 7737 } 7738 7739 if (User->getOpcode() != ISD::BRCOND && 7740 User->getOpcode() != ISD::SETCC && 7741 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { 7742 NonFlagUse = true; 7743 break; 7744 } 7745 } 7746 7747 if (!NonFlagUse) 7748 break; 7749 } 7750 // FALL THROUGH 7751 case ISD::SUB: 7752 case ISD::OR: 7753 case ISD::XOR: 7754 // Due to the ISEL shortcoming noted above, be conservative if this op is 7755 // likely to be selected as part of a load-modify-store instruction. 
7756 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 7757 UE = Op.getNode()->use_end(); UI != UE; ++UI) 7758 if (UI->getOpcode() == ISD::STORE) 7759 goto default_case; 7760 7761 // Otherwise use a regular EFLAGS-setting instruction. 7762 switch (Op.getNode()->getOpcode()) { 7763 default: llvm_unreachable("unexpected operator!"); 7764 case ISD::SUB: Opcode = X86ISD::SUB; break; 7765 case ISD::OR: Opcode = X86ISD::OR; break; 7766 case ISD::XOR: Opcode = X86ISD::XOR; break; 7767 case ISD::AND: Opcode = X86ISD::AND; break; 7768 } 7769 7770 NumOperands = 2; 7771 break; 7772 case X86ISD::ADD: 7773 case X86ISD::SUB: 7774 case X86ISD::INC: 7775 case X86ISD::DEC: 7776 case X86ISD::OR: 7777 case X86ISD::XOR: 7778 case X86ISD::AND: 7779 return SDValue(Op.getNode(), 1); 7780 default: 7781 default_case: 7782 break; 7783 } 7784 7785 if (Opcode == 0) 7786 // Emit a CMP with 0, which is the TEST pattern. 7787 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 7788 DAG.getConstant(0, Op.getValueType())); 7789 7790 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 7791 SmallVector<SDValue, 4> Ops; 7792 for (unsigned i = 0; i != NumOperands; ++i) 7793 Ops.push_back(Op.getOperand(i)); 7794 7795 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 7796 DAG.ReplaceAllUsesWith(Op, New); 7797 return SDValue(New.getNode(), 1); 7798} 7799 7800/// Emit nodes that will be selected as "cmp Op0,Op1", or something 7801/// equivalent. 7802SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 7803 SelectionDAG &DAG) const { 7804 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 7805 if (C->getAPIntValue() == 0) 7806 return EmitTest(Op0, X86CC, DAG); 7807 7808 DebugLoc dl = Op0.getDebugLoc(); 7809 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 7810} 7811 7812/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 7813/// if it's possible. 7814SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 7815 DebugLoc dl, SelectionDAG &DAG) const { 7816 SDValue Op0 = And.getOperand(0); 7817 SDValue Op1 = And.getOperand(1); 7818 if (Op0.getOpcode() == ISD::TRUNCATE) 7819 Op0 = Op0.getOperand(0); 7820 if (Op1.getOpcode() == ISD::TRUNCATE) 7821 Op1 = Op1.getOperand(0); 7822 7823 SDValue LHS, RHS; 7824 if (Op1.getOpcode() == ISD::SHL) 7825 std::swap(Op0, Op1); 7826 if (Op0.getOpcode() == ISD::SHL) { 7827 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 7828 if (And00C->getZExtValue() == 1) { 7829 // If we looked past a truncate, check that it's only truncating away 7830 // known zeros. 7831 unsigned BitWidth = Op0.getValueSizeInBits(); 7832 unsigned AndBitWidth = And.getValueSizeInBits(); 7833 if (BitWidth > AndBitWidth) { 7834 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 7835 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 7836 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 7837 return SDValue(); 7838 } 7839 LHS = Op1; 7840 RHS = Op0.getOperand(1); 7841 } 7842 } else if (Op1.getOpcode() == ISD::Constant) { 7843 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 7844 SDValue AndLHS = Op0; 7845 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 7846 LHS = AndLHS.getOperand(0); 7847 RHS = AndLHS.getOperand(1); 7848 } 7849 } 7850 7851 if (LHS.getNode()) { 7852 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 7853 // instruction. Since the shift amount is in-range-or-undefined, we know 7854 // that doing a bittest on the i32 value is ok. 
We extend to i32 because 7855 // the encoding for the i16 version is larger than the i32 version. 7856 // Also promote i16 to i32 for performance / code size reason. 7857 if (LHS.getValueType() == MVT::i8 || 7858 LHS.getValueType() == MVT::i16) 7859 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 7860 7861 // If the operand types disagree, extend the shift amount to match. Since 7862 // BT ignores high bits (like shifts) we can use anyextend. 7863 if (LHS.getValueType() != RHS.getValueType()) 7864 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 7865 7866 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 7867 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 7868 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7869 DAG.getConstant(Cond, MVT::i8), BT); 7870 } 7871 7872 return SDValue(); 7873} 7874 7875SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 7876 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 7877 SDValue Op0 = Op.getOperand(0); 7878 SDValue Op1 = Op.getOperand(1); 7879 DebugLoc dl = Op.getDebugLoc(); 7880 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 7881 7882 // Optimize to BT if possible. 7883 // Lower (X & (1 << N)) == 0 to BT(X, N). 7884 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 7885 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 7886 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 7887 Op1.getOpcode() == ISD::Constant && 7888 cast<ConstantSDNode>(Op1)->isNullValue() && 7889 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7890 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 7891 if (NewSetCC.getNode()) 7892 return NewSetCC; 7893 } 7894 7895 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 7896 // these. 7897 if (Op1.getOpcode() == ISD::Constant && 7898 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 7899 cast<ConstantSDNode>(Op1)->isNullValue()) && 7900 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7901 7902 // If the input is a setcc, then reuse the input setcc or use a new one with 7903 // the inverted condition. 7904 if (Op0.getOpcode() == X86ISD::SETCC) { 7905 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 7906 bool Invert = (CC == ISD::SETNE) ^ 7907 cast<ConstantSDNode>(Op1)->isNullValue(); 7908 if (!Invert) return Op0; 7909 7910 CCode = X86::GetOppositeBranchCondition(CCode); 7911 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7912 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 7913 } 7914 } 7915 7916 bool isFP = Op1.getValueType().isFloatingPoint(); 7917 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 7918 if (X86CC == X86::COND_INVALID) 7919 return SDValue(); 7920 7921 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); 7922 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7923 DAG.getConstant(X86CC, MVT::i8), EFLAGS); 7924} 7925 7926SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 7927 SDValue Cond; 7928 SDValue Op0 = Op.getOperand(0); 7929 SDValue Op1 = Op.getOperand(1); 7930 SDValue CC = Op.getOperand(2); 7931 EVT VT = Op.getValueType(); 7932 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 7933 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 7934 DebugLoc dl = Op.getDebugLoc(); 7935 7936 if (isFP) { 7937 unsigned SSECC = 8; 7938 EVT EltVT = Op0.getValueType().getVectorElementType(); 7939 assert(EltVT == MVT::f32 || EltVT == MVT::f64); 7940 7941 unsigned Opc = EltVT == MVT::f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; 7942 bool Swap = false; 7943 7944 switch (SetCCOpcode) { 7945 default: break; 7946 case ISD::SETOEQ: 7947 case ISD::SETEQ: SSECC = 0; break; 7948 case ISD::SETOGT: 7949 case ISD::SETGT: Swap = true; // Fallthrough 7950 case ISD::SETLT: 7951 case ISD::SETOLT: SSECC = 1; break; 7952 case ISD::SETOGE: 7953 case ISD::SETGE: Swap = true; // Fallthrough 7954 case ISD::SETLE: 7955 case ISD::SETOLE: SSECC = 2; break; 7956 case ISD::SETUO: SSECC = 3; break; 7957 case ISD::SETUNE: 7958 case ISD::SETNE: SSECC = 4; break; 7959 case ISD::SETULE: Swap = true; 7960 case ISD::SETUGE: SSECC = 5; break; 7961 case ISD::SETULT: Swap = true; 7962 case ISD::SETUGT: SSECC = 6; break; 7963 case ISD::SETO: SSECC = 7; break; 7964 } 7965 if (Swap) 7966 std::swap(Op0, Op1); 7967 7968 // In the two special cases we can't handle, emit two comparisons. 7969 if (SSECC == 8) { 7970 if (SetCCOpcode == ISD::SETUEQ) { 7971 SDValue UNORD, EQ; 7972 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 7973 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 7974 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 7975 } 7976 else if (SetCCOpcode == ISD::SETONE) { 7977 SDValue ORD, NEQ; 7978 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 7979 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 7980 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 7981 } 7982 llvm_unreachable("Illegal FP comparison"); 7983 } 7984 // Handle all other FP comparisons here. 7985 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 7986 } 7987 7988 if (!isFP && VT.getSizeInBits() == 256) 7989 return SDValue(); 7990 7991 // We are handling one of the integer comparisons here. Since SSE only has 7992 // GT and EQ comparisons for integer, swapping operands and multiple 7993 // operations may be required for some comparisons. 7994 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 7995 bool Swap = false, Invert = false, FlipSigns = false; 7996 7997 switch (VT.getSimpleVT().SimpleTy) { 7998 default: break; 7999 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 8000 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 8001 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 8002 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 8003 } 8004 8005 switch (SetCCOpcode) { 8006 default: break; 8007 case ISD::SETNE: Invert = true; 8008 case ISD::SETEQ: Opc = EQOpc; break; 8009 case ISD::SETLT: Swap = true; 8010 case ISD::SETGT: Opc = GTOpc; break; 8011 case ISD::SETGE: Swap = true; 8012 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 8013 case ISD::SETULT: Swap = true; 8014 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 8015 case ISD::SETUGE: Swap = true; 8016 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 8017 } 8018 if (Swap) 8019 std::swap(Op0, Op1); 8020 8021 // Since SSE has no unsigned integer comparisons, we need to flip the sign 8022 // bits of the inputs before performing those operations. 
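  // For example, an unsigned v4i32 compare x >u y becomes the signed compare
  // (x ^ 0x80000000) >s (y ^ 0x80000000); XORing both operands with the sign
  // bit preserves the unsigned ordering so PCMPGT can do the work.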
8023 if (FlipSigns) { 8024 EVT EltVT = VT.getVectorElementType(); 8025 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 8026 EltVT); 8027 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 8028 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 8029 SignBits.size()); 8030 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 8031 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 8032 } 8033 8034 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 8035 8036 // If the logical-not of the result is required, perform that now. 8037 if (Invert) 8038 Result = DAG.getNOT(dl, Result, VT); 8039 8040 return Result; 8041} 8042 8043// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 8044static bool isX86LogicalCmp(SDValue Op) { 8045 unsigned Opc = Op.getNode()->getOpcode(); 8046 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 8047 return true; 8048 if (Op.getResNo() == 1 && 8049 (Opc == X86ISD::ADD || 8050 Opc == X86ISD::SUB || 8051 Opc == X86ISD::ADC || 8052 Opc == X86ISD::SBB || 8053 Opc == X86ISD::SMUL || 8054 Opc == X86ISD::UMUL || 8055 Opc == X86ISD::INC || 8056 Opc == X86ISD::DEC || 8057 Opc == X86ISD::OR || 8058 Opc == X86ISD::XOR || 8059 Opc == X86ISD::AND)) 8060 return true; 8061 8062 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 8063 return true; 8064 8065 return false; 8066} 8067 8068static bool isZero(SDValue V) { 8069 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 8070 return C && C->isNullValue(); 8071} 8072 8073static bool isAllOnes(SDValue V) { 8074 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 8075 return C && C->isAllOnesValue(); 8076} 8077 8078SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 8079 bool addTest = true; 8080 SDValue Cond = Op.getOperand(0); 8081 SDValue Op1 = Op.getOperand(1); 8082 SDValue Op2 = Op.getOperand(2); 8083 DebugLoc DL = Op.getDebugLoc(); 8084 SDValue CC; 8085 8086 if (Cond.getOpcode() == ISD::SETCC) { 8087 SDValue NewCond = LowerSETCC(Cond, DAG); 8088 if (NewCond.getNode()) 8089 Cond = NewCond; 8090 } 8091 8092 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y 8093 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y 8094 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y 8095 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y 8096 if (Cond.getOpcode() == X86ISD::SETCC && 8097 Cond.getOperand(1).getOpcode() == X86ISD::CMP && 8098 isZero(Cond.getOperand(1).getOperand(1))) { 8099 SDValue Cmp = Cond.getOperand(1); 8100 8101 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); 8102 8103 if ((isAllOnes(Op1) || isAllOnes(Op2)) && 8104 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { 8105 SDValue Y = isAllOnes(Op2) ? Op1 : Op2; 8106 8107 SDValue CmpOp0 = Cmp.getOperand(0); 8108 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, 8109 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 8110 8111 SDValue Res = // Res = 0 or -1. 8112 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 8113 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 8114 8115 if (isAllOnes(Op1) != (CondCode == X86::COND_E)) 8116 Res = DAG.getNOT(DL, Res, Res.getValueType()); 8117 8118 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 8119 if (N2C == 0 || !N2C->isNullValue()) 8120 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); 8121 return Res; 8122 } 8123 } 8124 8125 // Look past (and (setcc_carry (cmp ...)), 1). 
8126 if (Cond.getOpcode() == ISD::AND && 8127 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 8128 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 8129 if (C && C->getAPIntValue() == 1) 8130 Cond = Cond.getOperand(0); 8131 } 8132 8133 // If condition flag is set by a X86ISD::CMP, then use it as the condition 8134 // setting operand in place of the X86ISD::SETCC. 8135 if (Cond.getOpcode() == X86ISD::SETCC || 8136 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 8137 CC = Cond.getOperand(0); 8138 8139 SDValue Cmp = Cond.getOperand(1); 8140 unsigned Opc = Cmp.getOpcode(); 8141 EVT VT = Op.getValueType(); 8142 8143 bool IllegalFPCMov = false; 8144 if (VT.isFloatingPoint() && !VT.isVector() && 8145 !isScalarFPTypeInSSEReg(VT)) // FPStack? 8146 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 8147 8148 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 8149 Opc == X86ISD::BT) { // FIXME 8150 Cond = Cmp; 8151 addTest = false; 8152 } 8153 } 8154 8155 if (addTest) { 8156 // Look pass the truncate. 8157 if (Cond.getOpcode() == ISD::TRUNCATE) 8158 Cond = Cond.getOperand(0); 8159 8160 // We know the result of AND is compared against zero. Try to match 8161 // it to BT. 8162 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 8163 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); 8164 if (NewSetCC.getNode()) { 8165 CC = NewSetCC.getOperand(0); 8166 Cond = NewSetCC.getOperand(1); 8167 addTest = false; 8168 } 8169 } 8170 } 8171 8172 if (addTest) { 8173 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 8174 Cond = EmitTest(Cond, X86::COND_NE, DAG); 8175 } 8176 8177 // a < b ? -1 : 0 -> RES = ~setcc_carry 8178 // a < b ? 0 : -1 -> RES = setcc_carry 8179 // a >= b ? -1 : 0 -> RES = setcc_carry 8180 // a >= b ? 0 : -1 -> RES = ~setcc_carry 8181 if (Cond.getOpcode() == X86ISD::CMP) { 8182 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); 8183 8184 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && 8185 (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { 8186 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 8187 DAG.getConstant(X86::COND_B, MVT::i8), Cond); 8188 if (isAllOnes(Op1) != (CondCode == X86::COND_B)) 8189 return DAG.getNOT(DL, Res, Res.getValueType()); 8190 return Res; 8191 } 8192 } 8193 8194 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 8195 // condition is true. 8196 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); 8197 SDValue Ops[] = { Op2, Op1, CC, Cond }; 8198 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); 8199} 8200 8201// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 8202// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 8203// from the AND / OR. 8204static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 8205 Opc = Op.getOpcode(); 8206 if (Opc != ISD::OR && Opc != ISD::AND) 8207 return false; 8208 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 8209 Op.getOperand(0).hasOneUse() && 8210 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 8211 Op.getOperand(1).hasOneUse()); 8212} 8213 8214// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 8215// 1 and that the SETCC node has a single use. 
8216static bool isXor1OfSetCC(SDValue Op) { 8217 if (Op.getOpcode() != ISD::XOR) 8218 return false; 8219 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 8220 if (N1C && N1C->getAPIntValue() == 1) { 8221 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 8222 Op.getOperand(0).hasOneUse(); 8223 } 8224 return false; 8225} 8226 8227SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 8228 bool addTest = true; 8229 SDValue Chain = Op.getOperand(0); 8230 SDValue Cond = Op.getOperand(1); 8231 SDValue Dest = Op.getOperand(2); 8232 DebugLoc dl = Op.getDebugLoc(); 8233 SDValue CC; 8234 8235 if (Cond.getOpcode() == ISD::SETCC) { 8236 SDValue NewCond = LowerSETCC(Cond, DAG); 8237 if (NewCond.getNode()) 8238 Cond = NewCond; 8239 } 8240#if 0 8241 // FIXME: LowerXALUO doesn't handle these!! 8242 else if (Cond.getOpcode() == X86ISD::ADD || 8243 Cond.getOpcode() == X86ISD::SUB || 8244 Cond.getOpcode() == X86ISD::SMUL || 8245 Cond.getOpcode() == X86ISD::UMUL) 8246 Cond = LowerXALUO(Cond, DAG); 8247#endif 8248 8249 // Look pass (and (setcc_carry (cmp ...)), 1). 8250 if (Cond.getOpcode() == ISD::AND && 8251 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 8252 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 8253 if (C && C->getAPIntValue() == 1) 8254 Cond = Cond.getOperand(0); 8255 } 8256 8257 // If condition flag is set by a X86ISD::CMP, then use it as the condition 8258 // setting operand in place of the X86ISD::SETCC. 8259 if (Cond.getOpcode() == X86ISD::SETCC || 8260 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 8261 CC = Cond.getOperand(0); 8262 8263 SDValue Cmp = Cond.getOperand(1); 8264 unsigned Opc = Cmp.getOpcode(); 8265 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 8266 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 8267 Cond = Cmp; 8268 addTest = false; 8269 } else { 8270 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 8271 default: break; 8272 case X86::COND_O: 8273 case X86::COND_B: 8274 // These can only come from an arithmetic instruction with overflow, 8275 // e.g. SADDO, UADDO. 8276 Cond = Cond.getNode()->getOperand(1); 8277 addTest = false; 8278 break; 8279 } 8280 } 8281 } else { 8282 unsigned CondOpc; 8283 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 8284 SDValue Cmp = Cond.getOperand(0).getOperand(1); 8285 if (CondOpc == ISD::OR) { 8286 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 8287 // two branches instead of an explicit OR instruction with a 8288 // separate test. 8289 if (Cmp == Cond.getOperand(1).getOperand(1) && 8290 isX86LogicalCmp(Cmp)) { 8291 CC = Cond.getOperand(0).getOperand(0); 8292 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8293 Chain, Dest, CC, Cmp); 8294 CC = Cond.getOperand(1).getOperand(0); 8295 Cond = Cmp; 8296 addTest = false; 8297 } 8298 } else { // ISD::AND 8299 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 8300 // two branches instead of an explicit AND instruction with a 8301 // separate test. However, we only do this if this block doesn't 8302 // have a fall-through edge, because this requires an explicit 8303 // jmp when the condition is false. 
8304 if (Cmp == Cond.getOperand(1).getOperand(1) && 8305 isX86LogicalCmp(Cmp) && 8306 Op.getNode()->hasOneUse()) { 8307 X86::CondCode CCode = 8308 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 8309 CCode = X86::GetOppositeBranchCondition(CCode); 8310 CC = DAG.getConstant(CCode, MVT::i8); 8311 SDNode *User = *Op.getNode()->use_begin(); 8312 // Look for an unconditional branch following this conditional branch. 8313 // We need this because we need to reverse the successors in order 8314 // to implement FCMP_OEQ. 8315 if (User->getOpcode() == ISD::BR) { 8316 SDValue FalseBB = User->getOperand(1); 8317 SDNode *NewBR = 8318 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 8319 assert(NewBR == User); 8320 (void)NewBR; 8321 Dest = FalseBB; 8322 8323 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8324 Chain, Dest, CC, Cmp); 8325 X86::CondCode CCode = 8326 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 8327 CCode = X86::GetOppositeBranchCondition(CCode); 8328 CC = DAG.getConstant(CCode, MVT::i8); 8329 Cond = Cmp; 8330 addTest = false; 8331 } 8332 } 8333 } 8334 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 8335 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 8336 // It should be transformed during dag combiner except when the condition 8337 // is set by a arithmetics with overflow node. 8338 X86::CondCode CCode = 8339 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 8340 CCode = X86::GetOppositeBranchCondition(CCode); 8341 CC = DAG.getConstant(CCode, MVT::i8); 8342 Cond = Cond.getOperand(0).getOperand(1); 8343 addTest = false; 8344 } 8345 } 8346 8347 if (addTest) { 8348 // Look pass the truncate. 8349 if (Cond.getOpcode() == ISD::TRUNCATE) 8350 Cond = Cond.getOperand(0); 8351 8352 // We know the result of AND is compared against zero. Try to match 8353 // it to BT. 8354 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 8355 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 8356 if (NewSetCC.getNode()) { 8357 CC = NewSetCC.getOperand(0); 8358 Cond = NewSetCC.getOperand(1); 8359 addTest = false; 8360 } 8361 } 8362 } 8363 8364 if (addTest) { 8365 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 8366 Cond = EmitTest(Cond, X86::COND_NE, DAG); 8367 } 8368 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8369 Chain, Dest, CC, Cond); 8370} 8371 8372 8373// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 8374// Calls to _alloca is needed to probe the stack when allocating more than 4k 8375// bytes in one go. Touching the stack at 4K increments is necessary to ensure 8376// that the guard pages used by the OS virtual memory manager are allocated in 8377// correct sequence. 8378SDValue 8379X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 8380 SelectionDAG &DAG) const { 8381 assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) && 8382 "This should be used only on Windows targets"); 8383 assert(!Subtarget->isTargetEnvMacho()); 8384 DebugLoc dl = Op.getDebugLoc(); 8385 8386 // Get the inputs. 8387 SDValue Chain = Op.getOperand(0); 8388 SDValue Size = Op.getOperand(1); 8389 // FIXME: Ensure alignment here 8390 8391 SDValue Flag; 8392 8393 EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; 8394 unsigned Reg = (Subtarget->is64Bit() ? 
X86::RAX : X86::EAX); 8395 8396 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); 8397 Flag = Chain.getValue(1); 8398 8399 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 8400 8401 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 8402 Flag = Chain.getValue(1); 8403 8404 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 8405 8406 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 8407 return DAG.getMergeValues(Ops1, 2, dl); 8408} 8409 8410SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 8411 MachineFunction &MF = DAG.getMachineFunction(); 8412 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 8413 8414 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 8415 DebugLoc DL = Op.getDebugLoc(); 8416 8417 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 8418 // vastart just stores the address of the VarArgsFrameIndex slot into the 8419 // memory location argument. 8420 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 8421 getPointerTy()); 8422 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 8423 MachinePointerInfo(SV), false, false, 0); 8424 } 8425 8426 // __va_list_tag: 8427 // gp_offset (0 - 6 * 8) 8428 // fp_offset (48 - 48 + 8 * 16) 8429 // overflow_arg_area (point to parameters coming in memory). 8430 // reg_save_area 8431 SmallVector<SDValue, 8> MemOps; 8432 SDValue FIN = Op.getOperand(1); 8433 // Store gp_offset 8434 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 8435 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 8436 MVT::i32), 8437 FIN, MachinePointerInfo(SV), false, false, 0); 8438 MemOps.push_back(Store); 8439 8440 // Store fp_offset 8441 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8442 FIN, DAG.getIntPtrConstant(4)); 8443 Store = DAG.getStore(Op.getOperand(0), DL, 8444 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 8445 MVT::i32), 8446 FIN, MachinePointerInfo(SV, 4), false, false, 0); 8447 MemOps.push_back(Store); 8448 8449 // Store ptr to overflow_arg_area 8450 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8451 FIN, DAG.getIntPtrConstant(4)); 8452 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 8453 getPointerTy()); 8454 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 8455 MachinePointerInfo(SV, 8), 8456 false, false, 0); 8457 MemOps.push_back(Store); 8458 8459 // Store ptr to reg_save_area. 
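  // Byte offsets within __va_list_tag: gp_offset at 0, fp_offset at 4,
  // overflow_arg_area at 8, and reg_save_area at 16 (stored just below).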
8460 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8461 FIN, DAG.getIntPtrConstant(8)); 8462 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 8463 getPointerTy()); 8464 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 8465 MachinePointerInfo(SV, 16), false, false, 0); 8466 MemOps.push_back(Store); 8467 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 8468 &MemOps[0], MemOps.size()); 8469} 8470 8471SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 8472 assert(Subtarget->is64Bit() && 8473 "LowerVAARG only handles 64-bit va_arg!"); 8474 assert((Subtarget->isTargetLinux() || 8475 Subtarget->isTargetDarwin()) && 8476 "Unhandled target in LowerVAARG"); 8477 assert(Op.getNode()->getNumOperands() == 4); 8478 SDValue Chain = Op.getOperand(0); 8479 SDValue SrcPtr = Op.getOperand(1); 8480 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 8481 unsigned Align = Op.getConstantOperandVal(3); 8482 DebugLoc dl = Op.getDebugLoc(); 8483 8484 EVT ArgVT = Op.getNode()->getValueType(0); 8485 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 8486 uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); 8487 uint8_t ArgMode; 8488 8489 // Decide which area this value should be read from. 8490 // TODO: Implement the AMD64 ABI in its entirety. This simple 8491 // selection mechanism works only for the basic types. 8492 if (ArgVT == MVT::f80) { 8493 llvm_unreachable("va_arg for f80 not yet implemented"); 8494 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 8495 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 8496 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 8497 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 8498 } else { 8499 llvm_unreachable("Unhandled argument type in LowerVAARG"); 8500 } 8501 8502 if (ArgMode == 2) { 8503 // Sanity Check: Make sure using fp_offset makes sense. 8504 assert(!UseSoftFloat && 8505 !(DAG.getMachineFunction() 8506 .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && 8507 Subtarget->hasXMM()); 8508 } 8509 8510 // Insert VAARG_64 node into the DAG 8511 // VAARG_64 returns two values: Variable Argument Address, Chain 8512 SmallVector<SDValue, 11> InstOps; 8513 InstOps.push_back(Chain); 8514 InstOps.push_back(SrcPtr); 8515 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 8516 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 8517 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 8518 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 8519 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 8520 VTs, &InstOps[0], InstOps.size(), 8521 MVT::i64, 8522 MachinePointerInfo(SV), 8523 /*Align=*/0, 8524 /*Volatile=*/false, 8525 /*ReadMem=*/true, 8526 /*WriteMem=*/true); 8527 Chain = VAARG.getValue(1); 8528 8529 // Load the next argument and return it 8530 return DAG.getLoad(ArgVT, dl, 8531 Chain, 8532 VAARG, 8533 MachinePointerInfo(), 8534 false, false, 0); 8535} 8536 8537SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 8538 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
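  // That struct is 24 bytes (two i32 offsets followed by two pointers), so
  // va_copy lowers to a plain 24-byte, 8-byte-aligned memcpy.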
8539 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 8540 SDValue Chain = Op.getOperand(0); 8541 SDValue DstPtr = Op.getOperand(1); 8542 SDValue SrcPtr = Op.getOperand(2); 8543 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 8544 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 8545 DebugLoc DL = Op.getDebugLoc(); 8546 8547 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 8548 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 8549 false, 8550 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 8551} 8552 8553SDValue 8554X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 8555 DebugLoc dl = Op.getDebugLoc(); 8556 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 8557 switch (IntNo) { 8558 default: return SDValue(); // Don't custom lower most intrinsics. 8559 // Comparison intrinsics. 8560 case Intrinsic::x86_sse_comieq_ss: 8561 case Intrinsic::x86_sse_comilt_ss: 8562 case Intrinsic::x86_sse_comile_ss: 8563 case Intrinsic::x86_sse_comigt_ss: 8564 case Intrinsic::x86_sse_comige_ss: 8565 case Intrinsic::x86_sse_comineq_ss: 8566 case Intrinsic::x86_sse_ucomieq_ss: 8567 case Intrinsic::x86_sse_ucomilt_ss: 8568 case Intrinsic::x86_sse_ucomile_ss: 8569 case Intrinsic::x86_sse_ucomigt_ss: 8570 case Intrinsic::x86_sse_ucomige_ss: 8571 case Intrinsic::x86_sse_ucomineq_ss: 8572 case Intrinsic::x86_sse2_comieq_sd: 8573 case Intrinsic::x86_sse2_comilt_sd: 8574 case Intrinsic::x86_sse2_comile_sd: 8575 case Intrinsic::x86_sse2_comigt_sd: 8576 case Intrinsic::x86_sse2_comige_sd: 8577 case Intrinsic::x86_sse2_comineq_sd: 8578 case Intrinsic::x86_sse2_ucomieq_sd: 8579 case Intrinsic::x86_sse2_ucomilt_sd: 8580 case Intrinsic::x86_sse2_ucomile_sd: 8581 case Intrinsic::x86_sse2_ucomigt_sd: 8582 case Intrinsic::x86_sse2_ucomige_sd: 8583 case Intrinsic::x86_sse2_ucomineq_sd: { 8584 unsigned Opc = 0; 8585 ISD::CondCode CC = ISD::SETCC_INVALID; 8586 switch (IntNo) { 8587 default: break; 8588 case Intrinsic::x86_sse_comieq_ss: 8589 case Intrinsic::x86_sse2_comieq_sd: 8590 Opc = X86ISD::COMI; 8591 CC = ISD::SETEQ; 8592 break; 8593 case Intrinsic::x86_sse_comilt_ss: 8594 case Intrinsic::x86_sse2_comilt_sd: 8595 Opc = X86ISD::COMI; 8596 CC = ISD::SETLT; 8597 break; 8598 case Intrinsic::x86_sse_comile_ss: 8599 case Intrinsic::x86_sse2_comile_sd: 8600 Opc = X86ISD::COMI; 8601 CC = ISD::SETLE; 8602 break; 8603 case Intrinsic::x86_sse_comigt_ss: 8604 case Intrinsic::x86_sse2_comigt_sd: 8605 Opc = X86ISD::COMI; 8606 CC = ISD::SETGT; 8607 break; 8608 case Intrinsic::x86_sse_comige_ss: 8609 case Intrinsic::x86_sse2_comige_sd: 8610 Opc = X86ISD::COMI; 8611 CC = ISD::SETGE; 8612 break; 8613 case Intrinsic::x86_sse_comineq_ss: 8614 case Intrinsic::x86_sse2_comineq_sd: 8615 Opc = X86ISD::COMI; 8616 CC = ISD::SETNE; 8617 break; 8618 case Intrinsic::x86_sse_ucomieq_ss: 8619 case Intrinsic::x86_sse2_ucomieq_sd: 8620 Opc = X86ISD::UCOMI; 8621 CC = ISD::SETEQ; 8622 break; 8623 case Intrinsic::x86_sse_ucomilt_ss: 8624 case Intrinsic::x86_sse2_ucomilt_sd: 8625 Opc = X86ISD::UCOMI; 8626 CC = ISD::SETLT; 8627 break; 8628 case Intrinsic::x86_sse_ucomile_ss: 8629 case Intrinsic::x86_sse2_ucomile_sd: 8630 Opc = X86ISD::UCOMI; 8631 CC = ISD::SETLE; 8632 break; 8633 case Intrinsic::x86_sse_ucomigt_ss: 8634 case Intrinsic::x86_sse2_ucomigt_sd: 8635 Opc = X86ISD::UCOMI; 8636 CC = ISD::SETGT; 8637 break; 8638 case Intrinsic::x86_sse_ucomige_ss: 8639 case Intrinsic::x86_sse2_ucomige_sd: 8640 Opc = X86ISD::UCOMI; 8641 
CC = ISD::SETGE; 8642 break; 8643 case Intrinsic::x86_sse_ucomineq_ss: 8644 case Intrinsic::x86_sse2_ucomineq_sd: 8645 Opc = X86ISD::UCOMI; 8646 CC = ISD::SETNE; 8647 break; 8648 } 8649 8650 SDValue LHS = Op.getOperand(1); 8651 SDValue RHS = Op.getOperand(2); 8652 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 8653 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 8654 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 8655 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8656 DAG.getConstant(X86CC, MVT::i8), Cond); 8657 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 8658 } 8659 // ptest and testp intrinsics. The intrinsic these come from are designed to 8660 // return an integer value, not just an instruction so lower it to the ptest 8661 // or testp pattern and a setcc for the result. 8662 case Intrinsic::x86_sse41_ptestz: 8663 case Intrinsic::x86_sse41_ptestc: 8664 case Intrinsic::x86_sse41_ptestnzc: 8665 case Intrinsic::x86_avx_ptestz_256: 8666 case Intrinsic::x86_avx_ptestc_256: 8667 case Intrinsic::x86_avx_ptestnzc_256: 8668 case Intrinsic::x86_avx_vtestz_ps: 8669 case Intrinsic::x86_avx_vtestc_ps: 8670 case Intrinsic::x86_avx_vtestnzc_ps: 8671 case Intrinsic::x86_avx_vtestz_pd: 8672 case Intrinsic::x86_avx_vtestc_pd: 8673 case Intrinsic::x86_avx_vtestnzc_pd: 8674 case Intrinsic::x86_avx_vtestz_ps_256: 8675 case Intrinsic::x86_avx_vtestc_ps_256: 8676 case Intrinsic::x86_avx_vtestnzc_ps_256: 8677 case Intrinsic::x86_avx_vtestz_pd_256: 8678 case Intrinsic::x86_avx_vtestc_pd_256: 8679 case Intrinsic::x86_avx_vtestnzc_pd_256: { 8680 bool IsTestPacked = false; 8681 unsigned X86CC = 0; 8682 switch (IntNo) { 8683 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 8684 case Intrinsic::x86_avx_vtestz_ps: 8685 case Intrinsic::x86_avx_vtestz_pd: 8686 case Intrinsic::x86_avx_vtestz_ps_256: 8687 case Intrinsic::x86_avx_vtestz_pd_256: 8688 IsTestPacked = true; // Fallthrough 8689 case Intrinsic::x86_sse41_ptestz: 8690 case Intrinsic::x86_avx_ptestz_256: 8691 // ZF = 1 8692 X86CC = X86::COND_E; 8693 break; 8694 case Intrinsic::x86_avx_vtestc_ps: 8695 case Intrinsic::x86_avx_vtestc_pd: 8696 case Intrinsic::x86_avx_vtestc_ps_256: 8697 case Intrinsic::x86_avx_vtestc_pd_256: 8698 IsTestPacked = true; // Fallthrough 8699 case Intrinsic::x86_sse41_ptestc: 8700 case Intrinsic::x86_avx_ptestc_256: 8701 // CF = 1 8702 X86CC = X86::COND_B; 8703 break; 8704 case Intrinsic::x86_avx_vtestnzc_ps: 8705 case Intrinsic::x86_avx_vtestnzc_pd: 8706 case Intrinsic::x86_avx_vtestnzc_ps_256: 8707 case Intrinsic::x86_avx_vtestnzc_pd_256: 8708 IsTestPacked = true; // Fallthrough 8709 case Intrinsic::x86_sse41_ptestnzc: 8710 case Intrinsic::x86_avx_ptestnzc_256: 8711 // ZF and CF = 0 8712 X86CC = X86::COND_A; 8713 break; 8714 } 8715 8716 SDValue LHS = Op.getOperand(1); 8717 SDValue RHS = Op.getOperand(2); 8718 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 8719 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 8720 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 8721 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 8722 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 8723 } 8724 8725 // Fix vector shift instructions where the last operand is a non-immediate 8726 // i32 value. 
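  // The *i_* intrinsics encode the shift count as an immediate; when the
  // count is a variable we rewrite to the corresponding register-count form
  // (e.g. x86_sse2_pslli_w -> x86_sse2_psll_w) and build the count vector.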
8727 case Intrinsic::x86_sse2_pslli_w: 8728 case Intrinsic::x86_sse2_pslli_d: 8729 case Intrinsic::x86_sse2_pslli_q: 8730 case Intrinsic::x86_sse2_psrli_w: 8731 case Intrinsic::x86_sse2_psrli_d: 8732 case Intrinsic::x86_sse2_psrli_q: 8733 case Intrinsic::x86_sse2_psrai_w: 8734 case Intrinsic::x86_sse2_psrai_d: 8735 case Intrinsic::x86_mmx_pslli_w: 8736 case Intrinsic::x86_mmx_pslli_d: 8737 case Intrinsic::x86_mmx_pslli_q: 8738 case Intrinsic::x86_mmx_psrli_w: 8739 case Intrinsic::x86_mmx_psrli_d: 8740 case Intrinsic::x86_mmx_psrli_q: 8741 case Intrinsic::x86_mmx_psrai_w: 8742 case Intrinsic::x86_mmx_psrai_d: { 8743 SDValue ShAmt = Op.getOperand(2); 8744 if (isa<ConstantSDNode>(ShAmt)) 8745 return SDValue(); 8746 8747 unsigned NewIntNo = 0; 8748 EVT ShAmtVT = MVT::v4i32; 8749 switch (IntNo) { 8750 case Intrinsic::x86_sse2_pslli_w: 8751 NewIntNo = Intrinsic::x86_sse2_psll_w; 8752 break; 8753 case Intrinsic::x86_sse2_pslli_d: 8754 NewIntNo = Intrinsic::x86_sse2_psll_d; 8755 break; 8756 case Intrinsic::x86_sse2_pslli_q: 8757 NewIntNo = Intrinsic::x86_sse2_psll_q; 8758 break; 8759 case Intrinsic::x86_sse2_psrli_w: 8760 NewIntNo = Intrinsic::x86_sse2_psrl_w; 8761 break; 8762 case Intrinsic::x86_sse2_psrli_d: 8763 NewIntNo = Intrinsic::x86_sse2_psrl_d; 8764 break; 8765 case Intrinsic::x86_sse2_psrli_q: 8766 NewIntNo = Intrinsic::x86_sse2_psrl_q; 8767 break; 8768 case Intrinsic::x86_sse2_psrai_w: 8769 NewIntNo = Intrinsic::x86_sse2_psra_w; 8770 break; 8771 case Intrinsic::x86_sse2_psrai_d: 8772 NewIntNo = Intrinsic::x86_sse2_psra_d; 8773 break; 8774 default: { 8775 ShAmtVT = MVT::v2i32; 8776 switch (IntNo) { 8777 case Intrinsic::x86_mmx_pslli_w: 8778 NewIntNo = Intrinsic::x86_mmx_psll_w; 8779 break; 8780 case Intrinsic::x86_mmx_pslli_d: 8781 NewIntNo = Intrinsic::x86_mmx_psll_d; 8782 break; 8783 case Intrinsic::x86_mmx_pslli_q: 8784 NewIntNo = Intrinsic::x86_mmx_psll_q; 8785 break; 8786 case Intrinsic::x86_mmx_psrli_w: 8787 NewIntNo = Intrinsic::x86_mmx_psrl_w; 8788 break; 8789 case Intrinsic::x86_mmx_psrli_d: 8790 NewIntNo = Intrinsic::x86_mmx_psrl_d; 8791 break; 8792 case Intrinsic::x86_mmx_psrli_q: 8793 NewIntNo = Intrinsic::x86_mmx_psrl_q; 8794 break; 8795 case Intrinsic::x86_mmx_psrai_w: 8796 NewIntNo = Intrinsic::x86_mmx_psra_w; 8797 break; 8798 case Intrinsic::x86_mmx_psrai_d: 8799 NewIntNo = Intrinsic::x86_mmx_psra_d; 8800 break; 8801 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 8802 } 8803 break; 8804 } 8805 } 8806 8807 // The vector shift intrinsics with scalars uses 32b shift amounts but 8808 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 8809 // to be zero. 8810 SDValue ShOps[4]; 8811 ShOps[0] = ShAmt; 8812 ShOps[1] = DAG.getConstant(0, MVT::i32); 8813 if (ShAmtVT == MVT::v4i32) { 8814 ShOps[2] = DAG.getUNDEF(MVT::i32); 8815 ShOps[3] = DAG.getUNDEF(MVT::i32); 8816 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 8817 } else { 8818 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 8819// FIXME this must be lowered to get rid of the invalid type. 
8820 } 8821 8822 EVT VT = Op.getValueType(); 8823 ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); 8824 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8825 DAG.getConstant(NewIntNo, MVT::i32), 8826 Op.getOperand(1), ShAmt); 8827 } 8828 } 8829} 8830 8831SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 8832 SelectionDAG &DAG) const { 8833 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 8834 MFI->setReturnAddressIsTaken(true); 8835 8836 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 8837 DebugLoc dl = Op.getDebugLoc(); 8838 8839 if (Depth > 0) { 8840 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 8841 SDValue Offset = 8842 DAG.getConstant(TD->getPointerSize(), 8843 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 8844 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 8845 DAG.getNode(ISD::ADD, dl, getPointerTy(), 8846 FrameAddr, Offset), 8847 MachinePointerInfo(), false, false, 0); 8848 } 8849 8850 // Just load the return address. 8851 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 8852 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 8853 RetAddrFI, MachinePointerInfo(), false, false, 0); 8854} 8855 8856SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 8857 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 8858 MFI->setFrameAddressIsTaken(true); 8859 8860 EVT VT = Op.getValueType(); 8861 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 8862 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 8863 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 8864 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 8865 while (Depth--) 8866 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 8867 MachinePointerInfo(), 8868 false, false, 0); 8869 return FrameAddr; 8870} 8871 8872SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 8873 SelectionDAG &DAG) const { 8874 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 8875} 8876 8877SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 8878 MachineFunction &MF = DAG.getMachineFunction(); 8879 SDValue Chain = Op.getOperand(0); 8880 SDValue Offset = Op.getOperand(1); 8881 SDValue Handler = Op.getOperand(2); 8882 DebugLoc dl = Op.getDebugLoc(); 8883 8884 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 8885 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 8886 getPointerTy()); 8887 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 8888 8889 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 8890 DAG.getIntPtrConstant(TD->getPointerSize())); 8891 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 8892 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 8893 false, false, 0); 8894 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 8895 MF.getRegInfo().addLiveOut(StoreAddrReg); 8896 8897 return DAG.getNode(X86ISD::EH_RETURN, dl, 8898 MVT::Other, 8899 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 8900} 8901 8902SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 8903 SelectionDAG &DAG) const { 8904 SDValue Root = Op.getOperand(0); 8905 SDValue Trmp = Op.getOperand(1); // trampoline 8906 SDValue FPtr = Op.getOperand(2); // nested function 8907 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 8908 DebugLoc dl = Op.getDebugLoc(); 8909 8910 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 8911 8912 if (Subtarget->is64Bit()) { 8913 SDValue OutChains[6]; 8914 8915 // Large code-model. 8916 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 8917 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 8918 8919 const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10); 8920 const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11); 8921 8922 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 8923 8924 // Load the pointer to the nested function into R11. 8925 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 8926 SDValue Addr = Trmp; 8927 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8928 Addr, MachinePointerInfo(TrmpAddr), 8929 false, false, 0); 8930 8931 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8932 DAG.getConstant(2, MVT::i64)); 8933 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 8934 MachinePointerInfo(TrmpAddr, 2), 8935 false, false, 2); 8936 8937 // Load the 'nest' parameter value into R10. 8938 // R10 is specified in X86CallingConv.td 8939 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 8940 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8941 DAG.getConstant(10, MVT::i64)); 8942 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8943 Addr, MachinePointerInfo(TrmpAddr, 10), 8944 false, false, 0); 8945 8946 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8947 DAG.getConstant(12, MVT::i64)); 8948 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 8949 MachinePointerInfo(TrmpAddr, 12), 8950 false, false, 2); 8951 8952 // Jump to the nested function. 8953 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
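    // Trampoline bytes so far: "movabs $fptr, %r11" occupies offsets 0-9 and
    // "movabs $nest, %r10" occupies offsets 10-19; the indirect jmp through
    // %r11 is emitted next at offsets 20-22.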
8954 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8955 DAG.getConstant(20, MVT::i64)); 8956 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8957 Addr, MachinePointerInfo(TrmpAddr, 20), 8958 false, false, 0); 8959 8960 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 8961 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8962 DAG.getConstant(22, MVT::i64)); 8963 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 8964 MachinePointerInfo(TrmpAddr, 22), 8965 false, false, 0); 8966 8967 SDValue Ops[] = 8968 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 8969 return DAG.getMergeValues(Ops, 2, dl); 8970 } else { 8971 const Function *Func = 8972 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 8973 CallingConv::ID CC = Func->getCallingConv(); 8974 unsigned NestReg; 8975 8976 switch (CC) { 8977 default: 8978 llvm_unreachable("Unsupported calling convention"); 8979 case CallingConv::C: 8980 case CallingConv::X86_StdCall: { 8981 // Pass 'nest' parameter in ECX. 8982 // Must be kept in sync with X86CallingConv.td 8983 NestReg = X86::ECX; 8984 8985 // Check that ECX wasn't needed by an 'inreg' parameter. 8986 FunctionType *FTy = Func->getFunctionType(); 8987 const AttrListPtr &Attrs = Func->getAttributes(); 8988 8989 if (!Attrs.isEmpty() && !Func->isVarArg()) { 8990 unsigned InRegCount = 0; 8991 unsigned Idx = 1; 8992 8993 for (FunctionType::param_iterator I = FTy->param_begin(), 8994 E = FTy->param_end(); I != E; ++I, ++Idx) 8995 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 8996 // FIXME: should only count parameters that are lowered to integers. 8997 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 8998 8999 if (InRegCount > 2) { 9000 report_fatal_error("Nest register in use - reduce number of inreg" 9001 " parameters!"); 9002 } 9003 } 9004 break; 9005 } 9006 case CallingConv::X86_FastCall: 9007 case CallingConv::X86_ThisCall: 9008 case CallingConv::Fast: 9009 // Pass 'nest' parameter in EAX. 9010 // Must be kept in sync with X86CallingConv.td 9011 NestReg = X86::EAX; 9012 break; 9013 } 9014 9015 SDValue OutChains[4]; 9016 SDValue Addr, Disp; 9017 9018 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9019 DAG.getConstant(10, MVT::i32)); 9020 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 9021 9022 // This is storing the opcode for MOV32ri. 9023 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 9024 const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg); 9025 OutChains[0] = DAG.getStore(Root, dl, 9026 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 9027 Trmp, MachinePointerInfo(TrmpAddr), 9028 false, false, 0); 9029 9030 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9031 DAG.getConstant(1, MVT::i32)); 9032 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 9033 MachinePointerInfo(TrmpAddr, 1), 9034 false, false, 1); 9035 9036 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
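    // 32-bit trampoline layout: the mov opcode byte sits at offset 0 with the
    // nest imm32 at 1-4; the jmp opcode goes at offset 5 with its rel32
    // displacement (computed relative to offset 10 above) at 6-9.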
9037 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9038 DAG.getConstant(5, MVT::i32)); 9039 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 9040 MachinePointerInfo(TrmpAddr, 5), 9041 false, false, 1); 9042 9043 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9044 DAG.getConstant(6, MVT::i32)); 9045 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 9046 MachinePointerInfo(TrmpAddr, 6), 9047 false, false, 1); 9048 9049 SDValue Ops[] = 9050 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 9051 return DAG.getMergeValues(Ops, 2, dl); 9052 } 9053} 9054 9055SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 9056 SelectionDAG &DAG) const { 9057 /* 9058 The rounding mode is in bits 11:10 of FPSR, and has the following 9059 settings: 9060 00 Round to nearest 9061 01 Round to -inf 9062 10 Round to +inf 9063 11 Round to 0 9064 9065 FLT_ROUNDS, on the other hand, expects the following: 9066 -1 Undefined 9067 0 Round to 0 9068 1 Round to nearest 9069 2 Round to +inf 9070 3 Round to -inf 9071 9072 To perform the conversion, we do: 9073 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 9074 */ 9075 9076 MachineFunction &MF = DAG.getMachineFunction(); 9077 const TargetMachine &TM = MF.getTarget(); 9078 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 9079 unsigned StackAlignment = TFI.getStackAlignment(); 9080 EVT VT = Op.getValueType(); 9081 DebugLoc DL = Op.getDebugLoc(); 9082 9083 // Save FP Control Word to stack slot 9084 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 9085 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 9086 9087 9088 MachineMemOperand *MMO = 9089 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 9090 MachineMemOperand::MOStore, 2, 2); 9091 9092 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 9093 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 9094 DAG.getVTList(MVT::Other), 9095 Ops, 2, MVT::i16, MMO); 9096 9097 // Load FP Control Word from stack slot 9098 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 9099 MachinePointerInfo(), false, false, 0); 9100 9101 // Transform as necessary 9102 SDValue CWD1 = 9103 DAG.getNode(ISD::SRL, DL, MVT::i16, 9104 DAG.getNode(ISD::AND, DL, MVT::i16, 9105 CWD, DAG.getConstant(0x800, MVT::i16)), 9106 DAG.getConstant(11, MVT::i8)); 9107 SDValue CWD2 = 9108 DAG.getNode(ISD::SRL, DL, MVT::i16, 9109 DAG.getNode(ISD::AND, DL, MVT::i16, 9110 CWD, DAG.getConstant(0x400, MVT::i16)), 9111 DAG.getConstant(9, MVT::i8)); 9112 9113 SDValue RetVal = 9114 DAG.getNode(ISD::AND, DL, MVT::i16, 9115 DAG.getNode(ISD::ADD, DL, MVT::i16, 9116 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 9117 DAG.getConstant(1, MVT::i16)), 9118 DAG.getConstant(3, MVT::i16)); 9119 9120 9121 return DAG.getNode((VT.getSizeInBits() < 16 ? 9122 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 9123} 9124 9125SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 9126 EVT VT = Op.getValueType(); 9127 EVT OpVT = VT; 9128 unsigned NumBits = VT.getSizeInBits(); 9129 DebugLoc dl = Op.getDebugLoc(); 9130 9131 Op = Op.getOperand(0); 9132 if (VT == MVT::i8) { 9133 // Zero extend to i32 since there is not an i8 bsr. 9134 OpVT = MVT::i32; 9135 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 9136 } 9137 9138 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 9139 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 9140 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 9141 9142 // If src is zero (i.e. 
bsr sets ZF), returns NumBits. 9143 SDValue Ops[] = { 9144 Op, 9145 DAG.getConstant(NumBits+NumBits-1, OpVT), 9146 DAG.getConstant(X86::COND_E, MVT::i8), 9147 Op.getValue(1) 9148 }; 9149 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 9150 9151 // Finally xor with NumBits-1. 9152 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 9153 9154 if (VT == MVT::i8) 9155 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 9156 return Op; 9157} 9158 9159SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 9160 EVT VT = Op.getValueType(); 9161 EVT OpVT = VT; 9162 unsigned NumBits = VT.getSizeInBits(); 9163 DebugLoc dl = Op.getDebugLoc(); 9164 9165 Op = Op.getOperand(0); 9166 if (VT == MVT::i8) { 9167 OpVT = MVT::i32; 9168 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 9169 } 9170 9171 // Issue a bsf (scan bits forward) which also sets EFLAGS. 9172 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 9173 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 9174 9175 // If src is zero (i.e. bsf sets ZF), returns NumBits. 9176 SDValue Ops[] = { 9177 Op, 9178 DAG.getConstant(NumBits, OpVT), 9179 DAG.getConstant(X86::COND_E, MVT::i8), 9180 Op.getValue(1) 9181 }; 9182 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 9183 9184 if (VT == MVT::i8) 9185 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 9186 return Op; 9187} 9188 9189SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { 9190 EVT VT = Op.getValueType(); 9191 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 9192 DebugLoc dl = Op.getDebugLoc(); 9193 9194 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 9195 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 9196 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 9197 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 9198 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 9199 // 9200 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 9201 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 9202 // return AloBlo + AloBhi + AhiBlo; 9203 9204 SDValue A = Op.getOperand(0); 9205 SDValue B = Op.getOperand(1); 9206 9207 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9208 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9209 A, DAG.getConstant(32, MVT::i32)); 9210 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9211 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9212 B, DAG.getConstant(32, MVT::i32)); 9213 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9214 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 9215 A, B); 9216 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9217 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 9218 A, Bhi); 9219 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9220 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 9221 Ahi, B); 9222 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9223 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9224 AloBhi, DAG.getConstant(32, MVT::i32)); 9225 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9226 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9227 AhiBlo, DAG.getConstant(32, MVT::i32)); 9228 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 9229 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 9230 return Res; 9231} 9232 9233SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 9234 9235 EVT VT = Op.getValueType(); 9236 DebugLoc dl = 
Op.getDebugLoc(); 9237 SDValue R = Op.getOperand(0); 9238 SDValue Amt = Op.getOperand(1); 9239 LLVMContext *Context = DAG.getContext(); 9240 9241 if (!(Subtarget->hasSSE2() || Subtarget->hasAVX())) 9242 return SDValue(); 9243 9244 // Decompose 256-bit shifts into smaller 128-bit shifts. 9245 if (VT.getSizeInBits() == 256) { 9246 int NumElems = VT.getVectorNumElements(); 9247 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 9248 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 9249 9250 // Extract the two vectors 9251 SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl); 9252 SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32), 9253 DAG, dl); 9254 9255 // Recreate the shift amount vectors 9256 SmallVector<SDValue, 4> Amt1Csts; 9257 SmallVector<SDValue, 4> Amt2Csts; 9258 for (int i = 0; i < NumElems/2; ++i) 9259 Amt1Csts.push_back(Amt->getOperand(i)); 9260 for (int i = NumElems/2; i < NumElems; ++i) 9261 Amt2Csts.push_back(Amt->getOperand(i)); 9262 9263 SDValue Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, 9264 &Amt1Csts[0], NumElems/2); 9265 SDValue Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, 9266 &Amt2Csts[0], NumElems/2); 9267 9268 // Issue new vector shifts for the smaller types 9269 V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); 9270 V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); 9271 9272 // Concatenate the result back 9273 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); 9274 } 9275 9276 // Optimize shl/srl/sra with constant shift amount. 9277 if (isSplatVector(Amt.getNode())) { 9278 SDValue SclrAmt = Amt->getOperand(0); 9279 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { 9280 uint64_t ShiftAmt = C->getZExtValue(); 9281 9282 if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL) 9283 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9284 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9285 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9286 9287 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL) 9288 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9289 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9290 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9291 9292 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL) 9293 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9294 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9295 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9296 9297 if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL) 9298 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9299 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9300 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9301 9302 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL) 9303 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9304 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 9305 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9306 9307 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL) 9308 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9309 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 9310 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9311 9312 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA) 9313 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9314 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 9315 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9316 9317 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA) 9318 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9319 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 9320 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9321 
} 9322 } 9323 9324 // Lower SHL with variable shift amount. 9325 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { 9326 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9327 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9328 Op.getOperand(1), DAG.getConstant(23, MVT::i32)); 9329 9330 ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); 9331 9332 std::vector<Constant*> CV(4, CI); 9333 Constant *C = ConstantVector::get(CV); 9334 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 9335 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 9336 MachinePointerInfo::getConstantPool(), 9337 false, false, 16); 9338 9339 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 9340 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); 9341 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 9342 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 9343 } 9344 if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { 9345 // a = a << 5; 9346 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9347 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9348 Op.getOperand(1), DAG.getConstant(5, MVT::i32)); 9349 9350 ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); 9351 ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); 9352 9353 std::vector<Constant*> CVM1(16, CM1); 9354 std::vector<Constant*> CVM2(16, CM2); 9355 Constant *C = ConstantVector::get(CVM1); 9356 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 9357 SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 9358 MachinePointerInfo::getConstantPool(), 9359 false, false, 16); 9360 9361 // r = pblendv(r, psllw(r & (char16)15, 4), a); 9362 M = DAG.getNode(ISD::AND, dl, VT, R, M); 9363 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9364 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 9365 DAG.getConstant(4, MVT::i32)); 9366 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op); 9367 // a += a 9368 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 9369 9370 C = ConstantVector::get(CVM2); 9371 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 9372 M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 9373 MachinePointerInfo::getConstantPool(), 9374 false, false, 16); 9375 9376 // r = pblendv(r, psllw(r & (char16)63, 2), a); 9377 M = DAG.getNode(ISD::AND, dl, VT, R, M); 9378 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9379 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 9380 DAG.getConstant(2, MVT::i32)); 9381 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op); 9382 // a += a 9383 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 9384 9385 // return pblendv(r, r+r, a); 9386 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, 9387 R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); 9388 return R; 9389 } 9390 return SDValue(); 9391} 9392 9393SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 9394 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 9395 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 9396 // looks for this combo and may remove the "setcc" instruction if the "setcc" 9397 // has only one use. 9398 SDNode *N = Op.getNode(); 9399 SDValue LHS = N->getOperand(0); 9400 SDValue RHS = N->getOperand(1); 9401 unsigned BaseOp = 0; 9402 unsigned Cond = 0; 9403 DebugLoc DL = Op.getDebugLoc(); 9404 switch (Op.getOpcode()) { 9405 default: llvm_unreachable("Unknown ovf instruction!"); 9406 case ISD::SADDO: 9407 // A subtract of one will be selected as a INC. 
Note that INC doesn't 9408 // set CF, so we can't do this for UADDO. 9409 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 9410 if (C->isOne()) { 9411 BaseOp = X86ISD::INC; 9412 Cond = X86::COND_O; 9413 break; 9414 } 9415 BaseOp = X86ISD::ADD; 9416 Cond = X86::COND_O; 9417 break; 9418 case ISD::UADDO: 9419 BaseOp = X86ISD::ADD; 9420 Cond = X86::COND_B; 9421 break; 9422 case ISD::SSUBO: 9423 // A subtract of one will be selected as a DEC. Note that DEC doesn't 9424 // set CF, so we can't do this for USUBO. 9425 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 9426 if (C->isOne()) { 9427 BaseOp = X86ISD::DEC; 9428 Cond = X86::COND_O; 9429 break; 9430 } 9431 BaseOp = X86ISD::SUB; 9432 Cond = X86::COND_O; 9433 break; 9434 case ISD::USUBO: 9435 BaseOp = X86ISD::SUB; 9436 Cond = X86::COND_B; 9437 break; 9438 case ISD::SMULO: 9439 BaseOp = X86ISD::SMUL; 9440 Cond = X86::COND_O; 9441 break; 9442 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 9443 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 9444 MVT::i32); 9445 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 9446 9447 SDValue SetCC = 9448 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9449 DAG.getConstant(X86::COND_O, MVT::i32), 9450 SDValue(Sum.getNode(), 2)); 9451 9452 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 9453 } 9454 } 9455 9456 // Also sets EFLAGS. 9457 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 9458 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 9459 9460 SDValue SetCC = 9461 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), 9462 DAG.getConstant(Cond, MVT::i32), 9463 SDValue(Sum.getNode(), 1)); 9464 9465 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 9466} 9467 9468SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{ 9469 DebugLoc dl = Op.getDebugLoc(); 9470 SDNode* Node = Op.getNode(); 9471 EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT(); 9472 EVT VT = Node->getValueType(0); 9473 9474 if (Subtarget->hasSSE2() && VT.isVector()) { 9475 unsigned BitsDiff = VT.getScalarType().getSizeInBits() - 9476 ExtraVT.getScalarType().getSizeInBits(); 9477 SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); 9478 9479 unsigned SHLIntrinsicsID = 0; 9480 unsigned SRAIntrinsicsID = 0; 9481 switch (VT.getSimpleVT().SimpleTy) { 9482 default: 9483 return SDValue(); 9484 case MVT::v2i64: { 9485 SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_q; 9486 SRAIntrinsicsID = 0; 9487 break; 9488 } 9489 case MVT::v4i32: { 9490 SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d; 9491 SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d; 9492 break; 9493 } 9494 case MVT::v8i16: { 9495 SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_w; 9496 SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w; 9497 break; 9498 } 9499 } 9500 9501 SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9502 DAG.getConstant(SHLIntrinsicsID, MVT::i32), 9503 Node->getOperand(0), ShAmt); 9504 9505 // In case of 1 bit sext, no need to shr 9506 if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1; 9507 9508 if (SRAIntrinsicsID) { 9509 Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9510 DAG.getConstant(SRAIntrinsicsID, MVT::i32), 9511 Tmp1, ShAmt); 9512 } 9513 return Tmp1; 9514 } 9515 9516 return SDValue(); 9517} 9518 9519 9520SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ 9521 DebugLoc dl = Op.getDebugLoc(); 9522 9523 // Go ahead and emit the fence on x86-64 even if we asked for 
no-sse2. 9524 // There isn't any reason to disable it if the target processor supports it. 9525 if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) { 9526 SDValue Chain = Op.getOperand(0); 9527 SDValue Zero = DAG.getConstant(0, MVT::i32); 9528 SDValue Ops[] = { 9529 DAG.getRegister(X86::ESP, MVT::i32), // Base 9530 DAG.getTargetConstant(1, MVT::i8), // Scale 9531 DAG.getRegister(0, MVT::i32), // Index 9532 DAG.getTargetConstant(0, MVT::i32), // Disp 9533 DAG.getRegister(0, MVT::i32), // Segment. 9534 Zero, 9535 Chain 9536 }; 9537 SDNode *Res = 9538 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 9539 array_lengthof(Ops)); 9540 return SDValue(Res, 0); 9541 } 9542 9543 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 9544 if (!isDev) 9545 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 9546 9547 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 9548 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 9549 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 9550 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 9551 9552 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 9553 if (!Op1 && !Op2 && !Op3 && Op4) 9554 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 9555 9556 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 9557 if (Op1 && !Op2 && !Op3 && !Op4) 9558 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 9559 9560 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 9561 // (MFENCE)>; 9562 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 9563} 9564 9565SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op, 9566 SelectionDAG &DAG) const { 9567 DebugLoc dl = Op.getDebugLoc(); 9568 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( 9569 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); 9570 SynchronizationScope FenceScope = static_cast<SynchronizationScope>( 9571 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 9572 9573 // The only fence that needs an instruction is a sequentially-consistent 9574 // cross-thread fence. 9575 if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { 9576 // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for 9577 // no-sse2). There isn't any reason to disable it if the target processor 9578 // supports it. 9579 if (Subtarget->hasSSE2() || Subtarget->is64Bit()) 9580 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 9581 9582 SDValue Chain = Op.getOperand(0); 9583 SDValue Zero = DAG.getConstant(0, MVT::i32); 9584 SDValue Ops[] = { 9585 DAG.getRegister(X86::ESP, MVT::i32), // Base 9586 DAG.getTargetConstant(1, MVT::i8), // Scale 9587 DAG.getRegister(0, MVT::i32), // Index 9588 DAG.getTargetConstant(0, MVT::i32), // Disp 9589 DAG.getRegister(0, MVT::i32), // Segment. 9590 Zero, 9591 Chain 9592 }; 9593 SDNode *Res = 9594 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 9595 array_lengthof(Ops)); 9596 return SDValue(Res, 0); 9597 } 9598 9599 // MEMBARRIER is a compiler barrier; it codegens to a no-op. 
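// For reference, the overall mapping implemented by this function is roughly:
//   seq_cst fence, cross-thread, SSE2 or x86-64:  mfence
//   seq_cst fence, cross-thread, otherwise:       an idempotent locked OR of
//                                                 zero to (%esp)
//   any weaker ordering or single-thread fence:   no instruction; the
//                                                 X86ISD::MEMBARRIER below
//                                                 only constrains the compiler.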
9600 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 9601} 9602 9603 9604SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 9605 EVT T = Op.getValueType(); 9606 DebugLoc DL = Op.getDebugLoc(); 9607 unsigned Reg = 0; 9608 unsigned size = 0; 9609 switch(T.getSimpleVT().SimpleTy) { 9610 default: 9611 assert(false && "Invalid value type!"); 9612 case MVT::i8: Reg = X86::AL; size = 1; break; 9613 case MVT::i16: Reg = X86::AX; size = 2; break; 9614 case MVT::i32: Reg = X86::EAX; size = 4; break; 9615 case MVT::i64: 9616 assert(Subtarget->is64Bit() && "Node not type legal!"); 9617 Reg = X86::RAX; size = 8; 9618 break; 9619 } 9620 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 9621 Op.getOperand(2), SDValue()); 9622 SDValue Ops[] = { cpIn.getValue(0), 9623 Op.getOperand(1), 9624 Op.getOperand(3), 9625 DAG.getTargetConstant(size, MVT::i8), 9626 cpIn.getValue(1) }; 9627 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 9628 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 9629 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 9630 Ops, 5, T, MMO); 9631 SDValue cpOut = 9632 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 9633 return cpOut; 9634} 9635 9636SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 9637 SelectionDAG &DAG) const { 9638 assert(Subtarget->is64Bit() && "Result not type legalized?"); 9639 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 9640 SDValue TheChain = Op.getOperand(0); 9641 DebugLoc dl = Op.getDebugLoc(); 9642 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 9643 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 9644 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 9645 rax.getValue(2)); 9646 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 9647 DAG.getConstant(32, MVT::i8)); 9648 SDValue Ops[] = { 9649 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 9650 rdx.getValue(1) 9651 }; 9652 return DAG.getMergeValues(Ops, 2, dl); 9653} 9654 9655SDValue X86TargetLowering::LowerBITCAST(SDValue Op, 9656 SelectionDAG &DAG) const { 9657 EVT SrcVT = Op.getOperand(0).getValueType(); 9658 EVT DstVT = Op.getValueType(); 9659 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && 9660 Subtarget->hasMMX() && "Unexpected custom BITCAST"); 9661 assert((DstVT == MVT::i64 || 9662 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 9663 "Unexpected custom BITCAST"); 9664 // i64 <=> MMX conversions are Legal. 9665 if (SrcVT==MVT::i64 && DstVT.isVector()) 9666 return Op; 9667 if (DstVT==MVT::i64 && SrcVT.isVector()) 9668 return Op; 9669 // MMX <=> MMX conversions are Legal. 9670 if (SrcVT.isVector() && DstVT.isVector()) 9671 return Op; 9672 // All other conversions need to be expanded. 
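// (Returning a null SDValue hands the node back to the legalizer, which will
// typically expand an unsupported bitcast through a stack temporary: store the
// value in the source type, then reload it in the destination type.)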
9673 return SDValue(); 9674} 9675 9676SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 9677 SDNode *Node = Op.getNode(); 9678 DebugLoc dl = Node->getDebugLoc(); 9679 EVT T = Node->getValueType(0); 9680 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 9681 DAG.getConstant(0, T), Node->getOperand(2)); 9682 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 9683 cast<AtomicSDNode>(Node)->getMemoryVT(), 9684 Node->getOperand(0), 9685 Node->getOperand(1), negOp, 9686 cast<AtomicSDNode>(Node)->getSrcValue(), 9687 cast<AtomicSDNode>(Node)->getAlignment(), 9688 cast<AtomicSDNode>(Node)->getOrdering(), 9689 cast<AtomicSDNode>(Node)->getSynchScope()); 9690} 9691 9692static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 9693 EVT VT = Op.getNode()->getValueType(0); 9694 9695 // Let legalize expand this if it isn't a legal type yet. 9696 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 9697 return SDValue(); 9698 9699 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 9700 9701 unsigned Opc; 9702 bool ExtraOp = false; 9703 switch (Op.getOpcode()) { 9704 default: assert(0 && "Invalid code"); 9705 case ISD::ADDC: Opc = X86ISD::ADD; break; 9706 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; 9707 case ISD::SUBC: Opc = X86ISD::SUB; break; 9708 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; 9709 } 9710 9711 if (!ExtraOp) 9712 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 9713 Op.getOperand(1)); 9714 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 9715 Op.getOperand(1), Op.getOperand(2)); 9716} 9717 9718/// LowerOperation - Provide custom lowering hooks for some operations. 9719/// 9720SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 9721 switch (Op.getOpcode()) { 9722 default: llvm_unreachable("Should not custom lower this!"); 9723 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); 9724 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 9725 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op,DAG); 9726 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 9727 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 9728 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 9729 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 9730 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 9731 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 9732 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 9733 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); 9734 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); 9735 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 9736 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 9737 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 9738 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 9739 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 9740 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 9741 case ISD::SHL_PARTS: 9742 case ISD::SRA_PARTS: 9743 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); 9744 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 9745 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 9746 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 9747 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 9748 case ISD::FABS: return LowerFABS(Op, DAG); 9749 case ISD::FNEG: return LowerFNEG(Op, DAG); 9750 case 
ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 9751 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); 9752 case ISD::SETCC: return LowerSETCC(Op, DAG); 9753 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 9754 case ISD::SELECT: return LowerSELECT(Op, DAG); 9755 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 9756 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 9757 case ISD::VASTART: return LowerVASTART(Op, DAG); 9758 case ISD::VAARG: return LowerVAARG(Op, DAG); 9759 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 9760 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 9761 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 9762 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 9763 case ISD::FRAME_TO_ARGS_OFFSET: 9764 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 9765 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 9766 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 9767 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 9768 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 9769 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 9770 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 9771 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 9772 case ISD::SRA: 9773 case ISD::SRL: 9774 case ISD::SHL: return LowerShift(Op, DAG); 9775 case ISD::SADDO: 9776 case ISD::UADDO: 9777 case ISD::SSUBO: 9778 case ISD::USUBO: 9779 case ISD::SMULO: 9780 case ISD::UMULO: return LowerXALUO(Op, DAG); 9781 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 9782 case ISD::BITCAST: return LowerBITCAST(Op, DAG); 9783 case ISD::ADDC: 9784 case ISD::ADDE: 9785 case ISD::SUBC: 9786 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 9787 } 9788} 9789 9790void X86TargetLowering:: 9791ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 9792 SelectionDAG &DAG, unsigned NewOp) const { 9793 EVT T = Node->getValueType(0); 9794 DebugLoc dl = Node->getDebugLoc(); 9795 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 9796 9797 SDValue Chain = Node->getOperand(0); 9798 SDValue In1 = Node->getOperand(1); 9799 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 9800 Node->getOperand(2), DAG.getIntPtrConstant(0)); 9801 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 9802 Node->getOperand(2), DAG.getIntPtrConstant(1)); 9803 SDValue Ops[] = { Chain, In1, In2L, In2H }; 9804 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 9805 SDValue Result = 9806 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 9807 cast<MemSDNode>(Node)->getMemOperand()); 9808 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 9809 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 9810 Results.push_back(Result.getValue(2)); 9811} 9812 9813/// ReplaceNodeResults - Replace a node with an illegal result type 9814/// with a new node built out of custom code. 9815void X86TargetLowering::ReplaceNodeResults(SDNode *N, 9816 SmallVectorImpl<SDValue>&Results, 9817 SelectionDAG &DAG) const { 9818 DebugLoc dl = N->getDebugLoc(); 9819 switch (N->getOpcode()) { 9820 default: 9821 assert(false && "Do not know how to custom type legalize this operation!"); 9822 return; 9823 case ISD::SIGN_EXTEND_INREG: 9824 case ISD::ADDC: 9825 case ISD::ADDE: 9826 case ISD::SUBC: 9827 case ISD::SUBE: 9828 // We don't want to expand or promote these. 
9829 return; 9830 case ISD::FP_TO_SINT: { 9831 std::pair<SDValue,SDValue> Vals = 9832 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 9833 SDValue FIST = Vals.first, StackSlot = Vals.second; 9834 if (FIST.getNode() != 0) { 9835 EVT VT = N->getValueType(0); 9836 // Return a load from the stack slot. 9837 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 9838 MachinePointerInfo(), false, false, 0)); 9839 } 9840 return; 9841 } 9842 case ISD::READCYCLECOUNTER: { 9843 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 9844 SDValue TheChain = N->getOperand(0); 9845 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 9846 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 9847 rd.getValue(1)); 9848 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 9849 eax.getValue(2)); 9850 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 9851 SDValue Ops[] = { eax, edx }; 9852 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 9853 Results.push_back(edx.getValue(1)); 9854 return; 9855 } 9856 case ISD::ATOMIC_CMP_SWAP: { 9857 EVT T = N->getValueType(0); 9858 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 9859 SDValue cpInL, cpInH; 9860 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 9861 DAG.getConstant(0, MVT::i32)); 9862 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 9863 DAG.getConstant(1, MVT::i32)); 9864 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 9865 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 9866 cpInL.getValue(1)); 9867 SDValue swapInL, swapInH; 9868 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 9869 DAG.getConstant(0, MVT::i32)); 9870 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 9871 DAG.getConstant(1, MVT::i32)); 9872 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 9873 cpInH.getValue(1)); 9874 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 9875 swapInL.getValue(1)); 9876 SDValue Ops[] = { swapInH.getValue(0), 9877 N->getOperand(1), 9878 swapInH.getValue(1) }; 9879 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 9880 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 9881 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, 9882 Ops, 3, T, MMO); 9883 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 9884 MVT::i32, Result.getValue(1)); 9885 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 9886 MVT::i32, cpOutL.getValue(2)); 9887 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 9888 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 9889 Results.push_back(cpOutH.getValue(1)); 9890 return; 9891 } 9892 case ISD::ATOMIC_LOAD_ADD: 9893 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 9894 return; 9895 case ISD::ATOMIC_LOAD_AND: 9896 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 9897 return; 9898 case ISD::ATOMIC_LOAD_NAND: 9899 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 9900 return; 9901 case ISD::ATOMIC_LOAD_OR: 9902 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 9903 return; 9904 case ISD::ATOMIC_LOAD_SUB: 9905 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 9906 return; 9907 case ISD::ATOMIC_LOAD_XOR: 9908 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 9909 return; 
9910 case ISD::ATOMIC_SWAP: 9911 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 9912 return; 9913 } 9914} 9915 9916const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 9917 switch (Opcode) { 9918 default: return NULL; 9919 case X86ISD::BSF: return "X86ISD::BSF"; 9920 case X86ISD::BSR: return "X86ISD::BSR"; 9921 case X86ISD::SHLD: return "X86ISD::SHLD"; 9922 case X86ISD::SHRD: return "X86ISD::SHRD"; 9923 case X86ISD::FAND: return "X86ISD::FAND"; 9924 case X86ISD::FOR: return "X86ISD::FOR"; 9925 case X86ISD::FXOR: return "X86ISD::FXOR"; 9926 case X86ISD::FSRL: return "X86ISD::FSRL"; 9927 case X86ISD::FILD: return "X86ISD::FILD"; 9928 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 9929 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 9930 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 9931 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 9932 case X86ISD::FLD: return "X86ISD::FLD"; 9933 case X86ISD::FST: return "X86ISD::FST"; 9934 case X86ISD::CALL: return "X86ISD::CALL"; 9935 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 9936 case X86ISD::BT: return "X86ISD::BT"; 9937 case X86ISD::CMP: return "X86ISD::CMP"; 9938 case X86ISD::COMI: return "X86ISD::COMI"; 9939 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 9940 case X86ISD::SETCC: return "X86ISD::SETCC"; 9941 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 9942 case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd"; 9943 case X86ISD::FSETCCss: return "X86ISD::FSETCCss"; 9944 case X86ISD::CMOV: return "X86ISD::CMOV"; 9945 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 9946 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 9947 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 9948 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 9949 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 9950 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 9951 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 9952 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 9953 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 9954 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 9955 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 9956 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 9957 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 9958 case X86ISD::ANDNP: return "X86ISD::ANDNP"; 9959 case X86ISD::PSIGNB: return "X86ISD::PSIGNB"; 9960 case X86ISD::PSIGNW: return "X86ISD::PSIGNW"; 9961 case X86ISD::PSIGND: return "X86ISD::PSIGND"; 9962 case X86ISD::PBLENDVB: return "X86ISD::PBLENDVB"; 9963 case X86ISD::FMAX: return "X86ISD::FMAX"; 9964 case X86ISD::FMIN: return "X86ISD::FMIN"; 9965 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 9966 case X86ISD::FRCP: return "X86ISD::FRCP"; 9967 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 9968 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 9969 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 9970 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 9971 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 9972 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 9973 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 9974 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 9975 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 9976 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 9977 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 9978 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 9979 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 9980 
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 9981 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 9982 case X86ISD::VSHL: return "X86ISD::VSHL"; 9983 case X86ISD::VSRL: return "X86ISD::VSRL"; 9984 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 9985 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 9986 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 9987 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 9988 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 9989 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 9990 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 9991 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 9992 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 9993 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 9994 case X86ISD::ADD: return "X86ISD::ADD"; 9995 case X86ISD::SUB: return "X86ISD::SUB"; 9996 case X86ISD::ADC: return "X86ISD::ADC"; 9997 case X86ISD::SBB: return "X86ISD::SBB"; 9998 case X86ISD::SMUL: return "X86ISD::SMUL"; 9999 case X86ISD::UMUL: return "X86ISD::UMUL"; 10000 case X86ISD::INC: return "X86ISD::INC"; 10001 case X86ISD::DEC: return "X86ISD::DEC"; 10002 case X86ISD::OR: return "X86ISD::OR"; 10003 case X86ISD::XOR: return "X86ISD::XOR"; 10004 case X86ISD::AND: return "X86ISD::AND"; 10005 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 10006 case X86ISD::PTEST: return "X86ISD::PTEST"; 10007 case X86ISD::TESTP: return "X86ISD::TESTP"; 10008 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 10009 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 10010 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 10011 case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; 10012 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 10013 case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; 10014 case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; 10015 case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; 10016 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 10017 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 10018 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 10019 case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; 10020 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 10021 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 10022 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 10023 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 10024 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 10025 case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; 10026 case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; 10027 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 10028 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 10029 case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; 10030 case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; 10031 case X86ISD::VUNPCKLPDY: return "X86ISD::VUNPCKLPDY"; 10032 case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; 10033 case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; 10034 case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; 10035 case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; 10036 case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; 10037 case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; 10038 case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; 10039 case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; 10040 case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; 10041 case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; 10042 case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS"; 10043 case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY"; 10044 case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD"; 10045 case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY"; 10046 case 
X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 10047 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 10048 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 10049 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; 10050 } 10051} 10052 10053// isLegalAddressingMode - Return true if the addressing mode represented 10054// by AM is legal for this target, for a load/store of the specified type. 10055bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 10056 Type *Ty) const { 10057 // X86 supports extremely general addressing modes. 10058 CodeModel::Model M = getTargetMachine().getCodeModel(); 10059 Reloc::Model R = getTargetMachine().getRelocationModel(); 10060 10061 // X86 allows a sign-extended 32-bit immediate field as a displacement. 10062 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 10063 return false; 10064 10065 if (AM.BaseGV) { 10066 unsigned GVFlags = 10067 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 10068 10069 // If a reference to this global requires an extra load, we can't fold it. 10070 if (isGlobalStubReference(GVFlags)) 10071 return false; 10072 10073 // If BaseGV requires a register for the PIC base, we cannot also have a 10074 // BaseReg specified. 10075 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 10076 return false; 10077 10078 // If lower 4G is not available, then we must use rip-relative addressing. 10079 if ((M != CodeModel::Small || R != Reloc::Static) && 10080 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 10081 return false; 10082 } 10083 10084 switch (AM.Scale) { 10085 case 0: 10086 case 1: 10087 case 2: 10088 case 4: 10089 case 8: 10090 // These scales always work. 10091 break; 10092 case 3: 10093 case 5: 10094 case 9: 10095 // These scales are formed with basereg+scalereg. Only accept if there is 10096 // no basereg yet. 10097 if (AM.HasBaseReg) 10098 return false; 10099 break; 10100 default: // Other stuff never works. 10101 return false; 10102 } 10103 10104 return true; 10105} 10106 10107 10108bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 10109 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 10110 return false; 10111 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 10112 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 10113 if (NumBits1 <= NumBits2) 10114 return false; 10115 return true; 10116} 10117 10118bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 10119 if (!VT1.isInteger() || !VT2.isInteger()) 10120 return false; 10121 unsigned NumBits1 = VT1.getSizeInBits(); 10122 unsigned NumBits2 = VT2.getSizeInBits(); 10123 if (NumBits1 <= NumBits2) 10124 return false; 10125 return true; 10126} 10127 10128bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { 10129 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 10130 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 10131} 10132 10133bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 10134 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 10135 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 10136} 10137 10138bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 10139 // i16 instructions are longer (0x66 prefix) and potentially slower. 
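// For example, rewriting "addl $42, %eax" as "addw $42, %ax" adds the 0x66
// operand-size prefix and, with a 16-bit immediate, can trigger a
// length-changing-prefix stall on some cores. Hence only i32 -> i16 narrowing
// is reported as unprofitable; narrowing to i8 is still considered fine.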
10140 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 10141} 10142 10143/// isShuffleMaskLegal - Targets can use this to indicate that they only 10144/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 10145/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 10146/// are assumed to be legal. 10147bool 10148X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 10149 EVT VT) const { 10150 // Very little shuffling can be done for 64-bit vectors right now. 10151 if (VT.getSizeInBits() == 64) 10152 return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); 10153 10154 // FIXME: pshufb, blends, shifts. 10155 return (VT.getVectorNumElements() == 2 || 10156 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 10157 isMOVLMask(M, VT) || 10158 isSHUFPMask(M, VT) || 10159 isPSHUFDMask(M, VT) || 10160 isPSHUFHWMask(M, VT) || 10161 isPSHUFLWMask(M, VT) || 10162 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 10163 isUNPCKLMask(M, VT) || 10164 isUNPCKHMask(M, VT) || 10165 isUNPCKL_v_undef_Mask(M, VT) || 10166 isUNPCKH_v_undef_Mask(M, VT)); 10167} 10168 10169bool 10170X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 10171 EVT VT) const { 10172 unsigned NumElts = VT.getVectorNumElements(); 10173 // FIXME: This collection of masks seems suspect. 10174 if (NumElts == 2) 10175 return true; 10176 if (NumElts == 4 && VT.getSizeInBits() == 128) { 10177 return (isMOVLMask(Mask, VT) || 10178 isCommutedMOVLMask(Mask, VT, true) || 10179 isSHUFPMask(Mask, VT) || 10180 isCommutedSHUFPMask(Mask, VT)); 10181 } 10182 return false; 10183} 10184 10185//===----------------------------------------------------------------------===// 10186// X86 Scheduler Hooks 10187//===----------------------------------------------------------------------===// 10188 10189// private utility function 10190MachineBasicBlock * 10191X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 10192 MachineBasicBlock *MBB, 10193 unsigned regOpc, 10194 unsigned immOpc, 10195 unsigned LoadOpc, 10196 unsigned CXchgOpc, 10197 unsigned notOpc, 10198 unsigned EAXreg, 10199 TargetRegisterClass *RC, 10200 bool invSrc) const { 10201 // For the atomic bitwise operator, we generate 10202 // thisMBB: 10203 // newMBB: 10204 // ld t1 = [bitinstr.addr] 10205 // op t2 = t1, [bitinstr.val] 10206 // mov EAX = t1 10207 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 10208 // bz newMBB 10209 // fallthrough -->nextMBB 10210 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10211 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 10212 MachineFunction::iterator MBBIter = MBB; 10213 ++MBBIter; 10214 10215 /// First build the CFG 10216 MachineFunction *F = MBB->getParent(); 10217 MachineBasicBlock *thisMBB = MBB; 10218 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 10219 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 10220 F->insert(MBBIter, newMBB); 10221 F->insert(MBBIter, nextMBB); 10222 10223 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 
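// (Everything after the atomic pseudo-instruction is moved into nextMBB, so
// newMBB will contain only the load / op / cmpxchg retry loop built below.)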
10224 nextMBB->splice(nextMBB->begin(), thisMBB, 10225 llvm::next(MachineBasicBlock::iterator(bInstr)), 10226 thisMBB->end()); 10227 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 10228 10229 // Update thisMBB to fall through to newMBB 10230 thisMBB->addSuccessor(newMBB); 10231 10232 // newMBB jumps to itself and fall through to nextMBB 10233 newMBB->addSuccessor(nextMBB); 10234 newMBB->addSuccessor(newMBB); 10235 10236 // Insert instructions into newMBB based on incoming instruction 10237 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && 10238 "unexpected number of operands"); 10239 DebugLoc dl = bInstr->getDebugLoc(); 10240 MachineOperand& destOper = bInstr->getOperand(0); 10241 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 10242 int numArgs = bInstr->getNumOperands() - 1; 10243 for (int i=0; i < numArgs; ++i) 10244 argOpers[i] = &bInstr->getOperand(i+1); 10245 10246 // x86 address has 4 operands: base, index, scale, and displacement 10247 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 10248 int valArgIndx = lastAddrIndx + 1; 10249 10250 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 10251 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 10252 for (int i=0; i <= lastAddrIndx; ++i) 10253 (*MIB).addOperand(*argOpers[i]); 10254 10255 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 10256 if (invSrc) { 10257 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 10258 } 10259 else 10260 tt = t1; 10261 10262 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 10263 assert((argOpers[valArgIndx]->isReg() || 10264 argOpers[valArgIndx]->isImm()) && 10265 "invalid operand"); 10266 if (argOpers[valArgIndx]->isReg()) 10267 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 10268 else 10269 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 10270 MIB.addReg(tt); 10271 (*MIB).addOperand(*argOpers[valArgIndx]); 10272 10273 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); 10274 MIB.addReg(t1); 10275 10276 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 10277 for (int i=0; i <= lastAddrIndx; ++i) 10278 (*MIB).addOperand(*argOpers[i]); 10279 MIB.addReg(t2); 10280 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 10281 (*MIB).setMemRefs(bInstr->memoperands_begin(), 10282 bInstr->memoperands_end()); 10283 10284 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 10285 MIB.addReg(EAXreg); 10286 10287 // insert branch 10288 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 10289 10290 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 10291 return nextMBB; 10292} 10293 10294// private utility function: 64 bit atomics on 32 bit host. 
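// The expansion leans on LOCK CMPXCHG8B, essentially the only 64-bit atomic
// read-modify-write primitive on 32-bit x86: the current value is kept in
// EDX:EAX, the desired new value is computed into ECX:EBX, and the cmpxchg8b
// is retried (JNE back to newMBB) until no other writer has intervened.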
10295MachineBasicBlock * 10296X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 10297 MachineBasicBlock *MBB, 10298 unsigned regOpcL, 10299 unsigned regOpcH, 10300 unsigned immOpcL, 10301 unsigned immOpcH, 10302 bool invSrc) const { 10303 // For the atomic bitwise operator, we generate 10304 // thisMBB (instructions are in pairs, except cmpxchg8b) 10305 // ld t1,t2 = [bitinstr.addr] 10306 // newMBB: 10307 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 10308 // op t5, t6 <- out1, out2, [bitinstr.val] 10309 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 10310 // mov ECX, EBX <- t5, t6 10311 // mov EAX, EDX <- t1, t2 10312 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 10313 // mov t3, t4 <- EAX, EDX 10314 // bz newMBB 10315 // result in out1, out2 10316 // fallthrough -->nextMBB 10317 10318 const TargetRegisterClass *RC = X86::GR32RegisterClass; 10319 const unsigned LoadOpc = X86::MOV32rm; 10320 const unsigned NotOpc = X86::NOT32r; 10321 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10322 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 10323 MachineFunction::iterator MBBIter = MBB; 10324 ++MBBIter; 10325 10326 /// First build the CFG 10327 MachineFunction *F = MBB->getParent(); 10328 MachineBasicBlock *thisMBB = MBB; 10329 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 10330 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 10331 F->insert(MBBIter, newMBB); 10332 F->insert(MBBIter, nextMBB); 10333 10334 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 10335 nextMBB->splice(nextMBB->begin(), thisMBB, 10336 llvm::next(MachineBasicBlock::iterator(bInstr)), 10337 thisMBB->end()); 10338 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 10339 10340 // Update thisMBB to fall through to newMBB 10341 thisMBB->addSuccessor(newMBB); 10342 10343 // newMBB jumps to itself and fall through to nextMBB 10344 newMBB->addSuccessor(nextMBB); 10345 newMBB->addSuccessor(newMBB); 10346 10347 DebugLoc dl = bInstr->getDebugLoc(); 10348 // Insert instructions into newMBB based on incoming instruction 10349 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 10350 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && 10351 "unexpected number of operands"); 10352 MachineOperand& dest1Oper = bInstr->getOperand(0); 10353 MachineOperand& dest2Oper = bInstr->getOperand(1); 10354 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 10355 for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { 10356 argOpers[i] = &bInstr->getOperand(i+2); 10357 10358 // We use some of the operands multiple times, so conservatively just 10359 // clear any kill flags that might be present. 10360 if (argOpers[i]->isReg() && argOpers[i]->isUse()) 10361 argOpers[i]->setIsKill(false); 10362 } 10363 10364 // x86 address has 5 operands: base, index, scale, displacement, and segment. 10365 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 10366 10367 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 10368 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 10369 for (int i=0; i <= lastAddrIndx; ++i) 10370 (*MIB).addOperand(*argOpers[i]); 10371 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 10372 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 10373 // add 4 to displacement. 
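// (The first MOV32rm above fetched the low half; this second load reuses the
// base/scale/index and segment operands but bumps the displacement by 4, so
// t1 = [addr] and t2 = [addr+4] together hold the 64-bit operand.)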
10374 for (int i=0; i <= lastAddrIndx-2; ++i) 10375 (*MIB).addOperand(*argOpers[i]); 10376 MachineOperand newOp3 = *(argOpers[3]); 10377 if (newOp3.isImm()) 10378 newOp3.setImm(newOp3.getImm()+4); 10379 else 10380 newOp3.setOffset(newOp3.getOffset()+4); 10381 (*MIB).addOperand(newOp3); 10382 (*MIB).addOperand(*argOpers[lastAddrIndx]); 10383 10384 // t3/4 are defined later, at the bottom of the loop 10385 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 10386 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 10387 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 10388 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 10389 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 10390 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 10391 10392 // The subsequent operations should be using the destination registers of 10393 //the PHI instructions. 10394 if (invSrc) { 10395 t1 = F->getRegInfo().createVirtualRegister(RC); 10396 t2 = F->getRegInfo().createVirtualRegister(RC); 10397 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 10398 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 10399 } else { 10400 t1 = dest1Oper.getReg(); 10401 t2 = dest2Oper.getReg(); 10402 } 10403 10404 int valArgIndx = lastAddrIndx + 1; 10405 assert((argOpers[valArgIndx]->isReg() || 10406 argOpers[valArgIndx]->isImm()) && 10407 "invalid operand"); 10408 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 10409 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 10410 if (argOpers[valArgIndx]->isReg()) 10411 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 10412 else 10413 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 10414 if (regOpcL != X86::MOV32rr) 10415 MIB.addReg(t1); 10416 (*MIB).addOperand(*argOpers[valArgIndx]); 10417 assert(argOpers[valArgIndx + 1]->isReg() == 10418 argOpers[valArgIndx]->isReg()); 10419 assert(argOpers[valArgIndx + 1]->isImm() == 10420 argOpers[valArgIndx]->isImm()); 10421 if (argOpers[valArgIndx + 1]->isReg()) 10422 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 10423 else 10424 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 10425 if (regOpcH != X86::MOV32rr) 10426 MIB.addReg(t2); 10427 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 10428 10429 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 10430 MIB.addReg(t1); 10431 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); 10432 MIB.addReg(t2); 10433 10434 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); 10435 MIB.addReg(t5); 10436 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); 10437 MIB.addReg(t6); 10438 10439 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 10440 for (int i=0; i <= lastAddrIndx; ++i) 10441 (*MIB).addOperand(*argOpers[i]); 10442 10443 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 10444 (*MIB).setMemRefs(bInstr->memoperands_begin(), 10445 bInstr->memoperands_end()); 10446 10447 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); 10448 MIB.addReg(X86::EAX); 10449 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); 10450 MIB.addReg(X86::EDX); 10451 10452 // insert branch 10453 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 10454 10455 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 
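// The instructions that originally followed the pseudo were spliced into
// nextMBB above, so nextMBB is returned as the block in which expansion of
// the remaining instructions continues.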
10456 return nextMBB; 10457} 10458 10459// private utility function 10460MachineBasicBlock * 10461X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 10462 MachineBasicBlock *MBB, 10463 unsigned cmovOpc) const { 10464 // For the atomic min/max operator, we generate 10465 // thisMBB: 10466 // newMBB: 10467 // ld t1 = [min/max.addr] 10468 // mov t2 = [min/max.val] 10469 // cmp t1, t2 10470 // cmov[cond] t2 = t1 10471 // mov EAX = t1 10472 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 10473 // bz newMBB 10474 // fallthrough -->nextMBB 10475 // 10476 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10477 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 10478 MachineFunction::iterator MBBIter = MBB; 10479 ++MBBIter; 10480 10481 /// First build the CFG 10482 MachineFunction *F = MBB->getParent(); 10483 MachineBasicBlock *thisMBB = MBB; 10484 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 10485 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 10486 F->insert(MBBIter, newMBB); 10487 F->insert(MBBIter, nextMBB); 10488 10489 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 10490 nextMBB->splice(nextMBB->begin(), thisMBB, 10491 llvm::next(MachineBasicBlock::iterator(mInstr)), 10492 thisMBB->end()); 10493 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 10494 10495 // Update thisMBB to fall through to newMBB 10496 thisMBB->addSuccessor(newMBB); 10497 10498 // newMBB jumps to newMBB and fall through to nextMBB 10499 newMBB->addSuccessor(nextMBB); 10500 newMBB->addSuccessor(newMBB); 10501 10502 DebugLoc dl = mInstr->getDebugLoc(); 10503 // Insert instructions into newMBB based on incoming instruction 10504 assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && 10505 "unexpected number of operands"); 10506 MachineOperand& destOper = mInstr->getOperand(0); 10507 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 10508 int numArgs = mInstr->getNumOperands() - 1; 10509 for (int i=0; i < numArgs; ++i) 10510 argOpers[i] = &mInstr->getOperand(i+1); 10511 10512 // x86 address has 4 operands: base, index, scale, and displacement 10513 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 10514 int valArgIndx = lastAddrIndx + 1; 10515 10516 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 10517 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 10518 for (int i=0; i <= lastAddrIndx; ++i) 10519 (*MIB).addOperand(*argOpers[i]); 10520 10521 // We only support register and immediate values 10522 assert((argOpers[valArgIndx]->isReg() || 10523 argOpers[valArgIndx]->isImm()) && 10524 "invalid operand"); 10525 10526 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 10527 if (argOpers[valArgIndx]->isReg()) 10528 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); 10529 else 10530 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 10531 (*MIB).addOperand(*argOpers[valArgIndx]); 10532 10533 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 10534 MIB.addReg(t1); 10535 10536 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 10537 MIB.addReg(t1); 10538 MIB.addReg(t2); 10539 10540 // Generate movc 10541 unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 10542 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 10543 MIB.addReg(t2); 10544 MIB.addReg(t1); 10545 10546 // Cmp and exchange if none has modified the memory location 10547 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 
10548 for (int i=0; i <= lastAddrIndx; ++i) 10549 (*MIB).addOperand(*argOpers[i]); 10550 MIB.addReg(t3); 10551 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 10552 (*MIB).setMemRefs(mInstr->memoperands_begin(), 10553 mInstr->memoperands_end()); 10554 10555 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 10556 MIB.addReg(X86::EAX); 10557 10558 // insert branch 10559 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 10560 10561 mInstr->eraseFromParent(); // The pseudo instruction is gone now. 10562 return nextMBB; 10563} 10564 10565// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 10566// or XMM0_V32I8 in AVX all of this code can be replaced with that 10567// in the .td file. 10568MachineBasicBlock * 10569X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 10570 unsigned numArgs, bool memArg) const { 10571 assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && 10572 "Target must have SSE4.2 or AVX features enabled"); 10573 10574 DebugLoc dl = MI->getDebugLoc(); 10575 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10576 unsigned Opc; 10577 if (!Subtarget->hasAVX()) { 10578 if (memArg) 10579 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 10580 else 10581 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 10582 } else { 10583 if (memArg) 10584 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 10585 else 10586 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 10587 } 10588 10589 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 10590 for (unsigned i = 0; i < numArgs; ++i) { 10591 MachineOperand &Op = MI->getOperand(i+1); 10592 if (!(Op.isReg() && Op.isImplicit())) 10593 MIB.addOperand(Op); 10594 } 10595 BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 10596 .addReg(X86::XMM0); 10597 10598 MI->eraseFromParent(); 10599 return BB; 10600} 10601 10602MachineBasicBlock * 10603X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const { 10604 DebugLoc dl = MI->getDebugLoc(); 10605 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10606 10607 // Address into RAX/EAX, other two args into ECX, EDX. 10608 unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; 10609 unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 10610 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); 10611 for (int i = 0; i < X86::AddrNumOperands; ++i) 10612 MIB.addOperand(MI->getOperand(i)); 10613 10614 unsigned ValOps = X86::AddrNumOperands; 10615 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 10616 .addReg(MI->getOperand(ValOps).getReg()); 10617 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) 10618 .addReg(MI->getOperand(ValOps+1).getReg()); 10619 10620 // The instruction doesn't actually take any operands though. 10621 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); 10622 10623 MI->eraseFromParent(); // The pseudo is gone now. 10624 return BB; 10625} 10626 10627MachineBasicBlock * 10628X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const { 10629 DebugLoc dl = MI->getDebugLoc(); 10630 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10631 10632 // First arg in ECX, the second in EAX. 
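  // MWAIT reads its operands implicitly from fixed registers, so the expansion
  // just copies the pseudo's two virtual registers into ECX and EAX and then
  // issues a bare MWAIT.  Roughly (an illustrative sketch, not the exact
  // post-RA output):
  //   mov ecx, <first operand>
  //   mov eax, <second operand>
  //   mwait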
10633 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 10634 .addReg(MI->getOperand(0).getReg()); 10635 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) 10636 .addReg(MI->getOperand(1).getReg()); 10637 10638 // The instruction doesn't actually take any operands though. 10639 BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr)); 10640 10641 MI->eraseFromParent(); // The pseudo is gone now. 10642 return BB; 10643} 10644 10645MachineBasicBlock * 10646X86TargetLowering::EmitVAARG64WithCustomInserter( 10647 MachineInstr *MI, 10648 MachineBasicBlock *MBB) const { 10649 // Emit va_arg instruction on X86-64. 10650 10651 // Operands to this pseudo-instruction: 10652 // 0 ) Output : destination address (reg) 10653 // 1-5) Input : va_list address (addr, i64mem) 10654 // 6 ) ArgSize : Size (in bytes) of vararg type 10655 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 10656 // 8 ) Align : Alignment of type 10657 // 9 ) EFLAGS (implicit-def) 10658 10659 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 10660 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); 10661 10662 unsigned DestReg = MI->getOperand(0).getReg(); 10663 MachineOperand &Base = MI->getOperand(1); 10664 MachineOperand &Scale = MI->getOperand(2); 10665 MachineOperand &Index = MI->getOperand(3); 10666 MachineOperand &Disp = MI->getOperand(4); 10667 MachineOperand &Segment = MI->getOperand(5); 10668 unsigned ArgSize = MI->getOperand(6).getImm(); 10669 unsigned ArgMode = MI->getOperand(7).getImm(); 10670 unsigned Align = MI->getOperand(8).getImm(); 10671 10672 // Memory Reference 10673 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 10674 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 10675 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 10676 10677 // Machine Information 10678 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10679 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 10680 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 10681 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 10682 DebugLoc DL = MI->getDebugLoc(); 10683 10684 // struct va_list { 10685 // i32 gp_offset 10686 // i32 fp_offset 10687 // i64 overflow_area (address) 10688 // i64 reg_save_area (address) 10689 // } 10690 // sizeof(va_list) = 24 10691 // alignment(va_list) = 8 10692 10693 unsigned TotalNumIntRegs = 6; 10694 unsigned TotalNumXMMRegs = 8; 10695 bool UseGPOffset = (ArgMode == 1); 10696 bool UseFPOffset = (ArgMode == 2); 10697 unsigned MaxOffset = TotalNumIntRegs * 8 + 10698 (UseFPOffset ? TotalNumXMMRegs * 16 : 0); 10699 10700 /* Align ArgSize to a multiple of 8 */ 10701 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 10702 bool NeedsAlign = (Align > 8); 10703 10704 MachineBasicBlock *thisMBB = MBB; 10705 MachineBasicBlock *overflowMBB; 10706 MachineBasicBlock *offsetMBB; 10707 MachineBasicBlock *endMBB; 10708 10709 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 10710 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 10711 unsigned OffsetReg = 0; 10712 10713 if (!UseGPOffset && !UseFPOffset) { 10714 // If we only pull from the overflow region, we don't create a branch. 10715 // We don't need to alter control flow. 
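  // This is the ArgMode == 0 ("overflow only") case from the operand list
  // above: the argument can only live in the overflow (stack) area, so the
  // expansion stays in thisMBB -- load overflow_area, align it if required,
  // bump it by the argument size, store it back -- with no compare or branch.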
10716 OffsetDestReg = 0; // unused 10717 OverflowDestReg = DestReg; 10718 10719 offsetMBB = NULL; 10720 overflowMBB = thisMBB; 10721 endMBB = thisMBB; 10722 } else { 10723 // First emit code to check if gp_offset (or fp_offset) is below the bound. 10724 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 10725 // If not, pull from overflow_area. (branch to overflowMBB) 10726 // 10727 // thisMBB 10728 // | . 10729 // | . 10730 // offsetMBB overflowMBB 10731 // | . 10732 // | . 10733 // endMBB 10734 10735 // Registers for the PHI in endMBB 10736 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 10737 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 10738 10739 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 10740 MachineFunction *MF = MBB->getParent(); 10741 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10742 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10743 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10744 10745 MachineFunction::iterator MBBIter = MBB; 10746 ++MBBIter; 10747 10748 // Insert the new basic blocks 10749 MF->insert(MBBIter, offsetMBB); 10750 MF->insert(MBBIter, overflowMBB); 10751 MF->insert(MBBIter, endMBB); 10752 10753 // Transfer the remainder of MBB and its successor edges to endMBB. 10754 endMBB->splice(endMBB->begin(), thisMBB, 10755 llvm::next(MachineBasicBlock::iterator(MI)), 10756 thisMBB->end()); 10757 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 10758 10759 // Make offsetMBB and overflowMBB successors of thisMBB 10760 thisMBB->addSuccessor(offsetMBB); 10761 thisMBB->addSuccessor(overflowMBB); 10762 10763 // endMBB is a successor of both offsetMBB and overflowMBB 10764 offsetMBB->addSuccessor(endMBB); 10765 overflowMBB->addSuccessor(endMBB); 10766 10767 // Load the offset value into a register 10768 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 10769 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 10770 .addOperand(Base) 10771 .addOperand(Scale) 10772 .addOperand(Index) 10773 .addDisp(Disp, UseFPOffset ? 4 : 0) 10774 .addOperand(Segment) 10775 .setMemRefs(MMOBegin, MMOEnd); 10776 10777 // Check if there is enough room left to pull this argument. 10778 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 10779 .addReg(OffsetReg) 10780 .addImm(MaxOffset + 8 - ArgSizeA8); 10781 10782 // Branch to "overflowMBB" if offset >= max 10783 // Fall through to "offsetMBB" otherwise 10784 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 10785 .addMBB(overflowMBB); 10786 } 10787 10788 // In offsetMBB, emit code to use the reg_save_area. 10789 if (offsetMBB) { 10790 assert(OffsetReg != 0); 10791 10792 // Read the reg_save_area address. 10793 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 10794 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 10795 .addOperand(Base) 10796 .addOperand(Scale) 10797 .addOperand(Index) 10798 .addDisp(Disp, 16) 10799 .addOperand(Segment) 10800 .setMemRefs(MMOBegin, MMOEnd); 10801 10802 // Zero-extend the offset 10803 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 10804 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 10805 .addImm(0) 10806 .addReg(OffsetReg) 10807 .addImm(X86::sub_32bit); 10808 10809 // Add the offset to the reg_save_area to get the final address. 
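  // Worked example (hedged, using the constants above): for an 8-byte GP
  // argument the bound checked in thisMBB is 6 * 8 = 48, and if gp_offset is
  // currently 16 then two register slots are already used, so the address
  // computed here is reg_save_area + 16.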
10810 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 10811 .addReg(OffsetReg64) 10812 .addReg(RegSaveReg); 10813 10814 // Compute the offset for the next argument 10815 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 10816 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 10817 .addReg(OffsetReg) 10818 .addImm(UseFPOffset ? 16 : 8); 10819 10820 // Store it back into the va_list. 10821 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 10822 .addOperand(Base) 10823 .addOperand(Scale) 10824 .addOperand(Index) 10825 .addDisp(Disp, UseFPOffset ? 4 : 0) 10826 .addOperand(Segment) 10827 .addReg(NextOffsetReg) 10828 .setMemRefs(MMOBegin, MMOEnd); 10829 10830 // Jump to endMBB 10831 BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) 10832 .addMBB(endMBB); 10833 } 10834 10835 // 10836 // Emit code to use overflow area 10837 // 10838 10839 // Load the overflow_area address into a register. 10840 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 10841 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 10842 .addOperand(Base) 10843 .addOperand(Scale) 10844 .addOperand(Index) 10845 .addDisp(Disp, 8) 10846 .addOperand(Segment) 10847 .setMemRefs(MMOBegin, MMOEnd); 10848 10849 // If we need to align it, do so. Otherwise, just copy the address 10850 // to OverflowDestReg. 10851 if (NeedsAlign) { 10852 // Align the overflow address 10853 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 10854 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 10855 10856 // aligned_addr = (addr + (align-1)) & ~(align-1) 10857 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 10858 .addReg(OverflowAddrReg) 10859 .addImm(Align-1); 10860 10861 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 10862 .addReg(TmpReg) 10863 .addImm(~(uint64_t)(Align-1)); 10864 } else { 10865 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 10866 .addReg(OverflowAddrReg); 10867 } 10868 10869 // Compute the next overflow address after this argument. 10870 // (the overflow address should be kept 8-byte aligned) 10871 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 10872 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 10873 .addReg(OverflowDestReg) 10874 .addImm(ArgSizeA8); 10875 10876 // Store the new overflow address. 10877 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 10878 .addOperand(Base) 10879 .addOperand(Scale) 10880 .addOperand(Index) 10881 .addDisp(Disp, 8) 10882 .addOperand(Segment) 10883 .addReg(NextAddrReg) 10884 .setMemRefs(MMOBegin, MMOEnd); 10885 10886 // If we branched, emit the PHI to the front of endMBB. 10887 if (offsetMBB) { 10888 BuildMI(*endMBB, endMBB->begin(), DL, 10889 TII->get(X86::PHI), DestReg) 10890 .addReg(OffsetDestReg).addMBB(offsetMBB) 10891 .addReg(OverflowDestReg).addMBB(overflowMBB); 10892 } 10893 10894 // Erase the pseudo instruction 10895 MI->eraseFromParent(); 10896 10897 return endMBB; 10898} 10899 10900MachineBasicBlock * 10901X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 10902 MachineInstr *MI, 10903 MachineBasicBlock *MBB) const { 10904 // Emit code to save XMM registers to the stack. The ABI says that the 10905 // number of registers to save is given in %al, so it's theoretically 10906 // possible to do an indirect jump trick to avoid saving all of them, 10907 // however this code takes a simpler approach and just executes all 10908 // of the stores if %al is non-zero. 
It's less code, and it's probably 10909 // easier on the hardware branch predictor, and stores aren't all that 10910 // expensive anyway. 10911 10912 // Create the new basic blocks. One block contains all the XMM stores, 10913 // and one block is the final destination regardless of whether any 10914 // stores were performed. 10915 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 10916 MachineFunction *F = MBB->getParent(); 10917 MachineFunction::iterator MBBIter = MBB; 10918 ++MBBIter; 10919 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 10920 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 10921 F->insert(MBBIter, XMMSaveMBB); 10922 F->insert(MBBIter, EndMBB); 10923 10924 // Transfer the remainder of MBB and its successor edges to EndMBB. 10925 EndMBB->splice(EndMBB->begin(), MBB, 10926 llvm::next(MachineBasicBlock::iterator(MI)), 10927 MBB->end()); 10928 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 10929 10930 // The original block will now fall through to the XMM save block. 10931 MBB->addSuccessor(XMMSaveMBB); 10932 // The XMMSaveMBB will fall through to the end block. 10933 XMMSaveMBB->addSuccessor(EndMBB); 10934 10935 // Now add the instructions. 10936 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10937 DebugLoc DL = MI->getDebugLoc(); 10938 10939 unsigned CountReg = MI->getOperand(0).getReg(); 10940 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 10941 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 10942 10943 if (!Subtarget->isTargetWin64()) { 10944 // If %al is 0, branch around the XMM save block. 10945 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 10946 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 10947 MBB->addSuccessor(EndMBB); 10948 } 10949 10950 // In the XMM save block, save all the XMM argument registers. 10951 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 10952 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 10953 MachineMemOperand *MMO = 10954 F->getMachineMemOperand( 10955 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 10956 MachineMemOperand::MOStore, 10957 /*Size=*/16, /*Align=*/16); 10958 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 10959 .addFrameIndex(RegSaveFrameIndex) 10960 .addImm(/*Scale=*/1) 10961 .addReg(/*IndexReg=*/0) 10962 .addImm(/*Disp=*/Offset) 10963 .addReg(/*Segment=*/0) 10964 .addReg(MI->getOperand(i).getReg()) 10965 .addMemOperand(MMO); 10966 } 10967 10968 MI->eraseFromParent(); // The pseudo instruction is gone now. 10969 10970 return EndMBB; 10971} 10972 10973MachineBasicBlock * 10974X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 10975 MachineBasicBlock *BB) const { 10976 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10977 DebugLoc DL = MI->getDebugLoc(); 10978 10979 // To "insert" a SELECT_CC instruction, we actually have to insert the 10980 // diamond control-flow pattern. The incoming instruction knows the 10981 // destination vreg to set, the condition code register to branch on, the 10982 // true/false values to select between, and a branch opcode to use. 10983 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10984 MachineFunction::iterator It = BB; 10985 ++It; 10986 10987 // thisMBB: 10988 // ... 10989 // TrueVal = ... 
10990 // cmpTY ccX, r1, r2 10991 // bCC copy1MBB 10992 // fallthrough --> copy0MBB 10993 MachineBasicBlock *thisMBB = BB; 10994 MachineFunction *F = BB->getParent(); 10995 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 10996 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 10997 F->insert(It, copy0MBB); 10998 F->insert(It, sinkMBB); 10999 11000 // If the EFLAGS register isn't dead in the terminator, then claim that it's 11001 // live into the sink and copy blocks. 11002 const MachineFunction *MF = BB->getParent(); 11003 const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); 11004 BitVector ReservedRegs = TRI->getReservedRegs(*MF); 11005 11006 for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { 11007 const MachineOperand &MO = MI->getOperand(I); 11008 if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; 11009 unsigned Reg = MO.getReg(); 11010 if (Reg != X86::EFLAGS) continue; 11011 copy0MBB->addLiveIn(Reg); 11012 sinkMBB->addLiveIn(Reg); 11013 } 11014 11015 // Transfer the remainder of BB and its successor edges to sinkMBB. 11016 sinkMBB->splice(sinkMBB->begin(), BB, 11017 llvm::next(MachineBasicBlock::iterator(MI)), 11018 BB->end()); 11019 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 11020 11021 // Add the true and fallthrough blocks as its successors. 11022 BB->addSuccessor(copy0MBB); 11023 BB->addSuccessor(sinkMBB); 11024 11025 // Create the conditional branch instruction. 11026 unsigned Opc = 11027 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 11028 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 11029 11030 // copy0MBB: 11031 // %FalseValue = ... 11032 // # fallthrough to sinkMBB 11033 copy0MBB->addSuccessor(sinkMBB); 11034 11035 // sinkMBB: 11036 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 11037 // ... 11038 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 11039 TII->get(X86::PHI), MI->getOperand(0).getReg()) 11040 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 11041 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 11042 11043 MI->eraseFromParent(); // The pseudo instruction is gone now. 11044 return sinkMBB; 11045} 11046 11047MachineBasicBlock * 11048X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 11049 MachineBasicBlock *BB) const { 11050 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11051 DebugLoc DL = MI->getDebugLoc(); 11052 11053 assert(!Subtarget->isTargetEnvMacho()); 11054 11055 // The lowering is pretty easy: we're just emitting the call to _alloca. The 11056 // non-trivial part is impdef of ESP. 11057 11058 if (Subtarget->isTargetWin64()) { 11059 if (Subtarget->isTargetCygMing()) { 11060 // ___chkstk(Mingw64): 11061 // Clobbers R10, R11, RAX and EFLAGS. 11062 // Updates RSP. 11063 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 11064 .addExternalSymbol("___chkstk") 11065 .addReg(X86::RAX, RegState::Implicit) 11066 .addReg(X86::RSP, RegState::Implicit) 11067 .addReg(X86::RAX, RegState::Define | RegState::Implicit) 11068 .addReg(X86::RSP, RegState::Define | RegState::Implicit) 11069 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 11070 } else { 11071 // __chkstk(MSVCRT): does not update stack pointer. 11072 // Clobbers R10, R11 and EFLAGS. 11073 // FIXME: RAX(allocated size) might be reused and not killed. 
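  // The emitted sequence is therefore roughly (a sketch; RAX is expected to
  // already hold the allocation size when this pseudo is reached):
  //   call __chkstk
  //   sub  rsp, rax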
11074 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 11075 .addExternalSymbol("__chkstk") 11076 .addReg(X86::RAX, RegState::Implicit) 11077 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 11078 // RAX has the offset to subtracted from RSP. 11079 BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) 11080 .addReg(X86::RSP) 11081 .addReg(X86::RAX); 11082 } 11083 } else { 11084 const char *StackProbeSymbol = 11085 Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; 11086 11087 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 11088 .addExternalSymbol(StackProbeSymbol) 11089 .addReg(X86::EAX, RegState::Implicit) 11090 .addReg(X86::ESP, RegState::Implicit) 11091 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 11092 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 11093 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 11094 } 11095 11096 MI->eraseFromParent(); // The pseudo instruction is gone now. 11097 return BB; 11098} 11099 11100MachineBasicBlock * 11101X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 11102 MachineBasicBlock *BB) const { 11103 // This is pretty easy. We're taking the value that we received from 11104 // our load from the relocation, sticking it in either RDI (x86-64) 11105 // or EAX and doing an indirect call. The return value will then 11106 // be in the normal return register. 11107 const X86InstrInfo *TII 11108 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 11109 DebugLoc DL = MI->getDebugLoc(); 11110 MachineFunction *F = BB->getParent(); 11111 11112 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 11113 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 11114 11115 if (Subtarget->is64Bit()) { 11116 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 11117 TII->get(X86::MOV64rm), X86::RDI) 11118 .addReg(X86::RIP) 11119 .addImm(0).addReg(0) 11120 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 11121 MI->getOperand(3).getTargetFlags()) 11122 .addReg(0); 11123 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 11124 addDirectMem(MIB, X86::RDI); 11125 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 11126 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 11127 TII->get(X86::MOV32rm), X86::EAX) 11128 .addReg(0) 11129 .addImm(0).addReg(0) 11130 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 11131 MI->getOperand(3).getTargetFlags()) 11132 .addReg(0); 11133 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 11134 addDirectMem(MIB, X86::EAX); 11135 } else { 11136 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 11137 TII->get(X86::MOV32rm), X86::EAX) 11138 .addReg(TII->getGlobalBaseReg(F)) 11139 .addImm(0).addReg(0) 11140 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 11141 MI->getOperand(3).getTargetFlags()) 11142 .addReg(0); 11143 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 11144 addDirectMem(MIB, X86::EAX); 11145 } 11146 11147 MI->eraseFromParent(); // The pseudo instruction is gone now. 
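  // For the 64-bit Darwin case above this amounts to something like (a hedged
  // sketch; the exact relocation comes from operand 3's target flags):
  //   movq  _var@TLVP(%rip), %rdi
  //   callq *(%rdi)
  // with the thread-local address coming back in the normal return register.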
11148 return BB; 11149} 11150 11151MachineBasicBlock * 11152X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 11153 MachineBasicBlock *BB) const { 11154 switch (MI->getOpcode()) { 11155 default: assert(false && "Unexpected instr type to insert"); 11156 case X86::TAILJMPd64: 11157 case X86::TAILJMPr64: 11158 case X86::TAILJMPm64: 11159 assert(!"TAILJMP64 would not be touched here."); 11160 case X86::TCRETURNdi64: 11161 case X86::TCRETURNri64: 11162 case X86::TCRETURNmi64: 11163 // Defs of TCRETURNxx64 has Win64's callee-saved registers, as subset. 11164 // On AMD64, additional defs should be added before register allocation. 11165 if (!Subtarget->isTargetWin64()) { 11166 MI->addRegisterDefined(X86::RSI); 11167 MI->addRegisterDefined(X86::RDI); 11168 MI->addRegisterDefined(X86::XMM6); 11169 MI->addRegisterDefined(X86::XMM7); 11170 MI->addRegisterDefined(X86::XMM8); 11171 MI->addRegisterDefined(X86::XMM9); 11172 MI->addRegisterDefined(X86::XMM10); 11173 MI->addRegisterDefined(X86::XMM11); 11174 MI->addRegisterDefined(X86::XMM12); 11175 MI->addRegisterDefined(X86::XMM13); 11176 MI->addRegisterDefined(X86::XMM14); 11177 MI->addRegisterDefined(X86::XMM15); 11178 } 11179 return BB; 11180 case X86::WIN_ALLOCA: 11181 return EmitLoweredWinAlloca(MI, BB); 11182 case X86::TLSCall_32: 11183 case X86::TLSCall_64: 11184 return EmitLoweredTLSCall(MI, BB); 11185 case X86::CMOV_GR8: 11186 case X86::CMOV_FR32: 11187 case X86::CMOV_FR64: 11188 case X86::CMOV_V4F32: 11189 case X86::CMOV_V2F64: 11190 case X86::CMOV_V2I64: 11191 case X86::CMOV_GR16: 11192 case X86::CMOV_GR32: 11193 case X86::CMOV_RFP32: 11194 case X86::CMOV_RFP64: 11195 case X86::CMOV_RFP80: 11196 return EmitLoweredSelect(MI, BB); 11197 11198 case X86::FP32_TO_INT16_IN_MEM: 11199 case X86::FP32_TO_INT32_IN_MEM: 11200 case X86::FP32_TO_INT64_IN_MEM: 11201 case X86::FP64_TO_INT16_IN_MEM: 11202 case X86::FP64_TO_INT32_IN_MEM: 11203 case X86::FP64_TO_INT64_IN_MEM: 11204 case X86::FP80_TO_INT16_IN_MEM: 11205 case X86::FP80_TO_INT32_IN_MEM: 11206 case X86::FP80_TO_INT64_IN_MEM: { 11207 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11208 DebugLoc DL = MI->getDebugLoc(); 11209 11210 // Change the floating point control register to use "round towards zero" 11211 // mode when truncating to an integer value. 11212 MachineFunction *F = BB->getParent(); 11213 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 11214 addFrameReference(BuildMI(*BB, MI, DL, 11215 TII->get(X86::FNSTCW16m)), CWFrameIdx); 11216 11217 // Load the old value of the high byte of the control word... 11218 unsigned OldCW = 11219 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 11220 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 11221 CWFrameIdx); 11222 11223 // Set the high part to be round to zero... 11224 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 11225 .addImm(0xC7F); 11226 11227 // Reload the modified control word now... 11228 addFrameReference(BuildMI(*BB, MI, DL, 11229 TII->get(X86::FLDCW16m)), CWFrameIdx); 11230 11231 // Restore the memory image of control word to original value 11232 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 11233 .addReg(OldCW); 11234 11235 // Get the X86 opcode to use. 
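  // The IST_Fp opcodes chosen below follow the pattern IST_Fp<dst-bits>m<src-bits>,
  // e.g. IST_Fp16m32 stores the f32 value currently on the x87 stack to memory
  // as a 16-bit integer -- which, with the control word set above, truncates
  // toward zero (naming interpretation hedged from the mapping in this switch).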
11236 unsigned Opc; 11237 switch (MI->getOpcode()) { 11238 default: llvm_unreachable("illegal opcode!"); 11239 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 11240 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 11241 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 11242 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 11243 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 11244 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 11245 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 11246 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 11247 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 11248 } 11249 11250 X86AddressMode AM; 11251 MachineOperand &Op = MI->getOperand(0); 11252 if (Op.isReg()) { 11253 AM.BaseType = X86AddressMode::RegBase; 11254 AM.Base.Reg = Op.getReg(); 11255 } else { 11256 AM.BaseType = X86AddressMode::FrameIndexBase; 11257 AM.Base.FrameIndex = Op.getIndex(); 11258 } 11259 Op = MI->getOperand(1); 11260 if (Op.isImm()) 11261 AM.Scale = Op.getImm(); 11262 Op = MI->getOperand(2); 11263 if (Op.isImm()) 11264 AM.IndexReg = Op.getImm(); 11265 Op = MI->getOperand(3); 11266 if (Op.isGlobal()) { 11267 AM.GV = Op.getGlobal(); 11268 } else { 11269 AM.Disp = Op.getImm(); 11270 } 11271 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 11272 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 11273 11274 // Reload the original control word now. 11275 addFrameReference(BuildMI(*BB, MI, DL, 11276 TII->get(X86::FLDCW16m)), CWFrameIdx); 11277 11278 MI->eraseFromParent(); // The pseudo instruction is gone now. 11279 return BB; 11280 } 11281 // String/text processing lowering. 11282 case X86::PCMPISTRM128REG: 11283 case X86::VPCMPISTRM128REG: 11284 return EmitPCMP(MI, BB, 3, false /* in-mem */); 11285 case X86::PCMPISTRM128MEM: 11286 case X86::VPCMPISTRM128MEM: 11287 return EmitPCMP(MI, BB, 3, true /* in-mem */); 11288 case X86::PCMPESTRM128REG: 11289 case X86::VPCMPESTRM128REG: 11290 return EmitPCMP(MI, BB, 5, false /* in mem */); 11291 case X86::PCMPESTRM128MEM: 11292 case X86::VPCMPESTRM128MEM: 11293 return EmitPCMP(MI, BB, 5, true /* in mem */); 11294 11295 // Thread synchronization. 11296 case X86::MONITOR: 11297 return EmitMonitor(MI, BB); 11298 case X86::MWAIT: 11299 return EmitMwait(MI, BB); 11300 11301 // Atomic Lowering. 
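  // Each ATOM* pseudo below expands (in EmitAtomicBitwiseWithCustomInserter
  // and friends) into a compare-exchange retry loop.  For ATOMAND32 that is
  // roughly (an illustrative sketch only; see the inserters for the real
  // block/PHI structure):
  //   t1 = load [addr]
  // retry:
  //   t2  = and t1, val
  //   eax = t1
  //   lock cmpxchg [addr], t2    ; implicitly compares against / redefines EAX
  //   jne retry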
11302 case X86::ATOMAND32: 11303 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 11304 X86::AND32ri, X86::MOV32rm, 11305 X86::LCMPXCHG32, 11306 X86::NOT32r, X86::EAX, 11307 X86::GR32RegisterClass); 11308 case X86::ATOMOR32: 11309 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 11310 X86::OR32ri, X86::MOV32rm, 11311 X86::LCMPXCHG32, 11312 X86::NOT32r, X86::EAX, 11313 X86::GR32RegisterClass); 11314 case X86::ATOMXOR32: 11315 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 11316 X86::XOR32ri, X86::MOV32rm, 11317 X86::LCMPXCHG32, 11318 X86::NOT32r, X86::EAX, 11319 X86::GR32RegisterClass); 11320 case X86::ATOMNAND32: 11321 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 11322 X86::AND32ri, X86::MOV32rm, 11323 X86::LCMPXCHG32, 11324 X86::NOT32r, X86::EAX, 11325 X86::GR32RegisterClass, true); 11326 case X86::ATOMMIN32: 11327 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 11328 case X86::ATOMMAX32: 11329 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 11330 case X86::ATOMUMIN32: 11331 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 11332 case X86::ATOMUMAX32: 11333 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 11334 11335 case X86::ATOMAND16: 11336 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 11337 X86::AND16ri, X86::MOV16rm, 11338 X86::LCMPXCHG16, 11339 X86::NOT16r, X86::AX, 11340 X86::GR16RegisterClass); 11341 case X86::ATOMOR16: 11342 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 11343 X86::OR16ri, X86::MOV16rm, 11344 X86::LCMPXCHG16, 11345 X86::NOT16r, X86::AX, 11346 X86::GR16RegisterClass); 11347 case X86::ATOMXOR16: 11348 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 11349 X86::XOR16ri, X86::MOV16rm, 11350 X86::LCMPXCHG16, 11351 X86::NOT16r, X86::AX, 11352 X86::GR16RegisterClass); 11353 case X86::ATOMNAND16: 11354 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 11355 X86::AND16ri, X86::MOV16rm, 11356 X86::LCMPXCHG16, 11357 X86::NOT16r, X86::AX, 11358 X86::GR16RegisterClass, true); 11359 case X86::ATOMMIN16: 11360 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 11361 case X86::ATOMMAX16: 11362 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 11363 case X86::ATOMUMIN16: 11364 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 11365 case X86::ATOMUMAX16: 11366 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 11367 11368 case X86::ATOMAND8: 11369 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 11370 X86::AND8ri, X86::MOV8rm, 11371 X86::LCMPXCHG8, 11372 X86::NOT8r, X86::AL, 11373 X86::GR8RegisterClass); 11374 case X86::ATOMOR8: 11375 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 11376 X86::OR8ri, X86::MOV8rm, 11377 X86::LCMPXCHG8, 11378 X86::NOT8r, X86::AL, 11379 X86::GR8RegisterClass); 11380 case X86::ATOMXOR8: 11381 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 11382 X86::XOR8ri, X86::MOV8rm, 11383 X86::LCMPXCHG8, 11384 X86::NOT8r, X86::AL, 11385 X86::GR8RegisterClass); 11386 case X86::ATOMNAND8: 11387 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 11388 X86::AND8ri, X86::MOV8rm, 11389 X86::LCMPXCHG8, 11390 X86::NOT8r, X86::AL, 11391 X86::GR8RegisterClass, true); 11392 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 11393 // This group is for 64-bit host. 
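  // "64-bit host" here means targets where GR64 and LCMPXCHG64 are legal
  // (i.e. x86-64).  On 32-bit targets, i64 atomics are handled by the
  // ATOM*6432 group further down, which pairs two 32-bit opcodes (e.g.
  // ADD32rr + ADC32rr for ATOMADD6432) around an LCMPXCHG8B retry loop.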
11394 case X86::ATOMAND64: 11395 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 11396 X86::AND64ri32, X86::MOV64rm, 11397 X86::LCMPXCHG64, 11398 X86::NOT64r, X86::RAX, 11399 X86::GR64RegisterClass); 11400 case X86::ATOMOR64: 11401 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 11402 X86::OR64ri32, X86::MOV64rm, 11403 X86::LCMPXCHG64, 11404 X86::NOT64r, X86::RAX, 11405 X86::GR64RegisterClass); 11406 case X86::ATOMXOR64: 11407 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 11408 X86::XOR64ri32, X86::MOV64rm, 11409 X86::LCMPXCHG64, 11410 X86::NOT64r, X86::RAX, 11411 X86::GR64RegisterClass); 11412 case X86::ATOMNAND64: 11413 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 11414 X86::AND64ri32, X86::MOV64rm, 11415 X86::LCMPXCHG64, 11416 X86::NOT64r, X86::RAX, 11417 X86::GR64RegisterClass, true); 11418 case X86::ATOMMIN64: 11419 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 11420 case X86::ATOMMAX64: 11421 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 11422 case X86::ATOMUMIN64: 11423 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 11424 case X86::ATOMUMAX64: 11425 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 11426 11427 // This group does 64-bit operations on a 32-bit host. 11428 case X86::ATOMAND6432: 11429 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11430 X86::AND32rr, X86::AND32rr, 11431 X86::AND32ri, X86::AND32ri, 11432 false); 11433 case X86::ATOMOR6432: 11434 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11435 X86::OR32rr, X86::OR32rr, 11436 X86::OR32ri, X86::OR32ri, 11437 false); 11438 case X86::ATOMXOR6432: 11439 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11440 X86::XOR32rr, X86::XOR32rr, 11441 X86::XOR32ri, X86::XOR32ri, 11442 false); 11443 case X86::ATOMNAND6432: 11444 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11445 X86::AND32rr, X86::AND32rr, 11446 X86::AND32ri, X86::AND32ri, 11447 true); 11448 case X86::ATOMADD6432: 11449 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11450 X86::ADD32rr, X86::ADC32rr, 11451 X86::ADD32ri, X86::ADC32ri, 11452 false); 11453 case X86::ATOMSUB6432: 11454 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11455 X86::SUB32rr, X86::SBB32rr, 11456 X86::SUB32ri, X86::SBB32ri, 11457 false); 11458 case X86::ATOMSWAP6432: 11459 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11460 X86::MOV32rr, X86::MOV32rr, 11461 X86::MOV32ri, X86::MOV32ri, 11462 false); 11463 case X86::VASTART_SAVE_XMM_REGS: 11464 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 11465 11466 case X86::VAARG_64: 11467 return EmitVAARG64WithCustomInserter(MI, BB); 11468 } 11469} 11470 11471//===----------------------------------------------------------------------===// 11472// X86 Optimization Hooks 11473//===----------------------------------------------------------------------===// 11474 11475void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 11476 const APInt &Mask, 11477 APInt &KnownZero, 11478 APInt &KnownOne, 11479 const SelectionDAG &DAG, 11480 unsigned Depth) const { 11481 unsigned Opc = Op.getOpcode(); 11482 assert((Opc >= ISD::BUILTIN_OP_END || 11483 Opc == ISD::INTRINSIC_WO_CHAIN || 11484 Opc == ISD::INTRINSIC_W_CHAIN || 11485 Opc == ISD::INTRINSIC_VOID) && 11486 "Should use MaskedValueIsZero if you don't know whether Op" 11487 " is a target node!"); 11488 11489 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
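  // The only fact the switch below knows is that the boolean-style results of
  // these nodes are 0 or 1, so every bit above bit 0 is known zero; e.g. for
  // an i8 result KnownZero ends up with the top 7 bits set (0xFE).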
11490 switch (Opc) { 11491 default: break; 11492 case X86ISD::ADD: 11493 case X86ISD::SUB: 11494 case X86ISD::ADC: 11495 case X86ISD::SBB: 11496 case X86ISD::SMUL: 11497 case X86ISD::UMUL: 11498 case X86ISD::INC: 11499 case X86ISD::DEC: 11500 case X86ISD::OR: 11501 case X86ISD::XOR: 11502 case X86ISD::AND: 11503 // These nodes' second result is a boolean. 11504 if (Op.getResNo() == 0) 11505 break; 11506 // Fallthrough 11507 case X86ISD::SETCC: 11508 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 11509 Mask.getBitWidth() - 1); 11510 break; 11511 } 11512} 11513 11514unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 11515 unsigned Depth) const { 11516 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 11517 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 11518 return Op.getValueType().getScalarType().getSizeInBits(); 11519 11520 // Fallback case. 11521 return 1; 11522} 11523 11524/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 11525/// node is a GlobalAddress + offset. 11526bool X86TargetLowering::isGAPlusOffset(SDNode *N, 11527 const GlobalValue* &GA, 11528 int64_t &Offset) const { 11529 if (N->getOpcode() == X86ISD::Wrapper) { 11530 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 11531 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 11532 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 11533 return true; 11534 } 11535 } 11536 return TargetLowering::isGAPlusOffset(N, GA, Offset); 11537} 11538 11539/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. 11540static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, 11541 TargetLowering::DAGCombinerInfo &DCI) { 11542 DebugLoc dl = N->getDebugLoc(); 11543 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 11544 SDValue V1 = SVOp->getOperand(0); 11545 SDValue V2 = SVOp->getOperand(1); 11546 EVT VT = SVOp->getValueType(0); 11547 11548 if (V1.getOpcode() == ISD::CONCAT_VECTORS && 11549 V2.getOpcode() == ISD::CONCAT_VECTORS) { 11550 // 11551 // 0,0,0,... 11552 // | 11553 // V UNDEF BUILD_VECTOR UNDEF 11554 // \ / \ / 11555 // CONCAT_VECTOR CONCAT_VECTOR 11556 // \ / 11557 // \ / 11558 // RESULT: V + zero extended 11559 // 11560 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || 11561 V2.getOperand(1).getOpcode() != ISD::UNDEF || 11562 V1.getOperand(1).getOpcode() != ISD::UNDEF) 11563 return SDValue(); 11564 11565 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) 11566 return SDValue(); 11567 11568 // To match the shuffle mask, the first half of the mask should 11569 // be exactly the first vector, and all the rest a splat with the 11570 // first element of the second one. 11571 int NumElems = VT.getVectorNumElements(); 11572 for (int i = 0; i < NumElems/2; ++i) 11573 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || 11574 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) 11575 return SDValue(); 11576 11577 // Emit a zeroed vector and insert the desired subvector on its 11578 // first half. 11579 SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */, DAG, dl); 11580 SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 11581 DAG.getConstant(0, MVT::i32), DAG, dl); 11582 return DCI.CombineTo(N, InsV); 11583 } 11584 11585 return SDValue(); 11586} 11587 11588/// PerformShuffleCombine - Performs several different shuffle combines. 
11589static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 11590 TargetLowering::DAGCombinerInfo &DCI) { 11591 DebugLoc dl = N->getDebugLoc(); 11592 EVT VT = N->getValueType(0); 11593 11594 // Don't create instructions with illegal types after legalize types has run. 11595 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11596 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 11597 return SDValue(); 11598 11599 // Only handle pure VECTOR_SHUFFLE nodes. 11600 if (VT.getSizeInBits() == 256 && N->getOpcode() == ISD::VECTOR_SHUFFLE) 11601 return PerformShuffleCombine256(N, DAG, DCI); 11602 11603 // Only handle 128 wide vector from here on. 11604 if (VT.getSizeInBits() != 128) 11605 return SDValue(); 11606 11607 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, 11608 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are 11609 // consecutive, non-overlapping, and in the right order. 11610 SmallVector<SDValue, 16> Elts; 11611 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 11612 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); 11613 11614 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 11615} 11616 11617/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 11618/// generation and convert it from being a bunch of shuffles and extracts 11619/// to a simple store and scalar loads to extract the elements. 11620static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 11621 const TargetLowering &TLI) { 11622 SDValue InputVector = N->getOperand(0); 11623 11624 // Only operate on vectors of 4 elements, where the alternative shuffling 11625 // gets to be more expensive. 11626 if (InputVector.getValueType() != MVT::v4i32) 11627 return SDValue(); 11628 11629 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 11630 // single use which is a sign-extend or zero-extend, and all elements are 11631 // used. 11632 SmallVector<SDNode *, 4> Uses; 11633 unsigned ExtractedElements = 0; 11634 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 11635 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 11636 if (UI.getUse().getResNo() != InputVector.getResNo()) 11637 return SDValue(); 11638 11639 SDNode *Extract = *UI; 11640 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 11641 return SDValue(); 11642 11643 if (Extract->getValueType(0) != MVT::i32) 11644 return SDValue(); 11645 if (!Extract->hasOneUse()) 11646 return SDValue(); 11647 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 11648 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 11649 return SDValue(); 11650 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 11651 return SDValue(); 11652 11653 // Record which element was extracted. 11654 ExtractedElements |= 11655 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 11656 11657 Uses.push_back(Extract); 11658 } 11659 11660 // If not all the elements were used, this may not be worthwhile. 11661 if (ExtractedElements != 15) 11662 return SDValue(); 11663 11664 // Ok, we've now decided to do the transformation. 11665 DebugLoc dl = InputVector.getDebugLoc(); 11666 11667 // Store the value to a temporary stack slot. 
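  // The rest of the combine replaces the extract+extend chains with one vector
  // store and per-element scalar loads.  For a v4i32 input with all four lanes
  // used, that is roughly (a sketch; each load below is emitted per recorded
  // use):
  //   store <4 x i32> %vec, 16-byte stack temporary
  //   %e0 = load i32, temp + 0
  //   %e1 = load i32, temp + 4
  //   %e2 = load i32, temp + 8
  //   %e3 = load i32, temp + 12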
11668 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 11669 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, 11670 MachinePointerInfo(), false, false, 0); 11671 11672 // Replace each use (extract) with a load of the appropriate element. 11673 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), 11674 UE = Uses.end(); UI != UE; ++UI) { 11675 SDNode *Extract = *UI; 11676 11677 // cOMpute the element's address. 11678 SDValue Idx = Extract->getOperand(1); 11679 unsigned EltSize = 11680 InputVector.getValueType().getVectorElementType().getSizeInBits()/8; 11681 uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); 11682 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); 11683 11684 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), 11685 StackPtr, OffsetVal); 11686 11687 // Load the scalar. 11688 SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, 11689 ScalarAddr, MachinePointerInfo(), 11690 false, false, 0); 11691 11692 // Replace the exact with the load. 11693 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); 11694 } 11695 11696 // The replacement was made in place; don't return anything. 11697 return SDValue(); 11698} 11699 11700/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 11701static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 11702 const X86Subtarget *Subtarget) { 11703 DebugLoc DL = N->getDebugLoc(); 11704 SDValue Cond = N->getOperand(0); 11705 // Get the LHS/RHS of the select. 11706 SDValue LHS = N->getOperand(1); 11707 SDValue RHS = N->getOperand(2); 11708 11709 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 11710 // instructions match the semantics of the common C idiom x<y?x:y but not 11711 // x<=y?x:y, because of how they handle negative zero (which can be 11712 // ignored in unsafe-math mode). 11713 if (Subtarget->hasSSE2() && 11714 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 11715 Cond.getOpcode() == ISD::SETCC) { 11716 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 11717 11718 unsigned Opcode = 0; 11719 // Check for x CC y ? x : y. 11720 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 11721 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 11722 switch (CC) { 11723 default: break; 11724 case ISD::SETULT: 11725 // Converting this to a min would handle NaNs incorrectly, and swapping 11726 // the operands would cause it to handle comparisons between positive 11727 // and negative zero incorrectly. 11728 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 11729 if (!UnsafeFPMath && 11730 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 11731 break; 11732 std::swap(LHS, RHS); 11733 } 11734 Opcode = X86ISD::FMIN; 11735 break; 11736 case ISD::SETOLE: 11737 // Converting this to a min would handle comparisons between positive 11738 // and negative zero incorrectly. 11739 if (!UnsafeFPMath && 11740 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 11741 break; 11742 Opcode = X86ISD::FMIN; 11743 break; 11744 case ISD::SETULE: 11745 // Converting this to a min would handle both negative zeros and NaNs 11746 // incorrectly, but we can swap the operands to fix both. 
11747 std::swap(LHS, RHS); 11748 case ISD::SETOLT: 11749 case ISD::SETLT: 11750 case ISD::SETLE: 11751 Opcode = X86ISD::FMIN; 11752 break; 11753 11754 case ISD::SETOGE: 11755 // Converting this to a max would handle comparisons between positive 11756 // and negative zero incorrectly. 11757 if (!UnsafeFPMath && 11758 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 11759 break; 11760 Opcode = X86ISD::FMAX; 11761 break; 11762 case ISD::SETUGT: 11763 // Converting this to a max would handle NaNs incorrectly, and swapping 11764 // the operands would cause it to handle comparisons between positive 11765 // and negative zero incorrectly. 11766 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 11767 if (!UnsafeFPMath && 11768 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 11769 break; 11770 std::swap(LHS, RHS); 11771 } 11772 Opcode = X86ISD::FMAX; 11773 break; 11774 case ISD::SETUGE: 11775 // Converting this to a max would handle both negative zeros and NaNs 11776 // incorrectly, but we can swap the operands to fix both. 11777 std::swap(LHS, RHS); 11778 case ISD::SETOGT: 11779 case ISD::SETGT: 11780 case ISD::SETGE: 11781 Opcode = X86ISD::FMAX; 11782 break; 11783 } 11784 // Check for x CC y ? y : x -- a min/max with reversed arms. 11785 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 11786 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 11787 switch (CC) { 11788 default: break; 11789 case ISD::SETOGE: 11790 // Converting this to a min would handle comparisons between positive 11791 // and negative zero incorrectly, and swapping the operands would 11792 // cause it to handle NaNs incorrectly. 11793 if (!UnsafeFPMath && 11794 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 11795 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 11796 break; 11797 std::swap(LHS, RHS); 11798 } 11799 Opcode = X86ISD::FMIN; 11800 break; 11801 case ISD::SETUGT: 11802 // Converting this to a min would handle NaNs incorrectly. 11803 if (!UnsafeFPMath && 11804 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 11805 break; 11806 Opcode = X86ISD::FMIN; 11807 break; 11808 case ISD::SETUGE: 11809 // Converting this to a min would handle both negative zeros and NaNs 11810 // incorrectly, but we can swap the operands to fix both. 11811 std::swap(LHS, RHS); 11812 case ISD::SETOGT: 11813 case ISD::SETGT: 11814 case ISD::SETGE: 11815 Opcode = X86ISD::FMIN; 11816 break; 11817 11818 case ISD::SETULT: 11819 // Converting this to a max would handle NaNs incorrectly. 11820 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 11821 break; 11822 Opcode = X86ISD::FMAX; 11823 break; 11824 case ISD::SETOLE: 11825 // Converting this to a max would handle comparisons between positive 11826 // and negative zero incorrectly, and swapping the operands would 11827 // cause it to handle NaNs incorrectly. 11828 if (!UnsafeFPMath && 11829 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 11830 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 11831 break; 11832 std::swap(LHS, RHS); 11833 } 11834 Opcode = X86ISD::FMAX; 11835 break; 11836 case ISD::SETULE: 11837 // Converting this to a max would handle both negative zeros and NaNs 11838 // incorrectly, but we can swap the operands to fix both. 
11839 std::swap(LHS, RHS); 11840 case ISD::SETOLT: 11841 case ISD::SETLT: 11842 case ISD::SETLE: 11843 Opcode = X86ISD::FMAX; 11844 break; 11845 } 11846 } 11847 11848 if (Opcode) 11849 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 11850 } 11851 11852 // If this is a select between two integer constants, try to do some 11853 // optimizations. 11854 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 11855 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 11856 // Don't do this for crazy integer types. 11857 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 11858 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 11859 // so that TrueC (the true value) is larger than FalseC. 11860 bool NeedsCondInvert = false; 11861 11862 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 11863 // Efficiently invertible. 11864 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 11865 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 11866 isa<ConstantSDNode>(Cond.getOperand(1))))) { 11867 NeedsCondInvert = true; 11868 std::swap(TrueC, FalseC); 11869 } 11870 11871 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 11872 if (FalseC->getAPIntValue() == 0 && 11873 TrueC->getAPIntValue().isPowerOf2()) { 11874 if (NeedsCondInvert) // Invert the condition if needed. 11875 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 11876 DAG.getConstant(1, Cond.getValueType())); 11877 11878 // Zero extend the condition if needed. 11879 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 11880 11881 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 11882 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 11883 DAG.getConstant(ShAmt, MVT::i8)); 11884 } 11885 11886 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 11887 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 11888 if (NeedsCondInvert) // Invert the condition if needed. 11889 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 11890 DAG.getConstant(1, Cond.getValueType())); 11891 11892 // Zero extend the condition if needed. 11893 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 11894 FalseC->getValueType(0), Cond); 11895 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 11896 SDValue(FalseC, 0)); 11897 } 11898 11899 // Optimize cases that will turn into an LEA instruction. This requires 11900 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 11901 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 11902 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 11903 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 11904 11905 bool isFastMultiplier = false; 11906 if (Diff < 10) { 11907 switch ((unsigned char)Diff) { 11908 default: break; 11909 case 1: // result = add base, cond 11910 case 2: // result = lea base( , cond*2) 11911 case 3: // result = lea base(cond, cond*2) 11912 case 4: // result = lea base( , cond*4) 11913 case 5: // result = lea base(cond, cond*4) 11914 case 8: // result = lea base( , cond*8) 11915 case 9: // result = lea base(cond, cond*8) 11916 isFastMultiplier = true; 11917 break; 11918 } 11919 } 11920 11921 if (isFastMultiplier) { 11922 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 11923 if (NeedsCondInvert) // Invert the condition if needed. 11924 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 11925 DAG.getConstant(1, Cond.getValueType())); 11926 11927 // Zero extend the condition if needed. 
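  // The net effect is zext(cond) scaled by Diff plus FalseC, which instruction
  // selection can typically fold into a single LEA.  For example,
  // select(cond, 13, 10) has Diff == 3 and becomes roughly
  //   lea result, [cond + cond*2 + 10]
  // (an illustrative sketch; the actual folding happens later).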
11928 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 11929 Cond); 11930 // Scale the condition by the difference. 11931 if (Diff != 1) 11932 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 11933 DAG.getConstant(Diff, Cond.getValueType())); 11934 11935 // Add the base if non-zero. 11936 if (FalseC->getAPIntValue() != 0) 11937 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 11938 SDValue(FalseC, 0)); 11939 return Cond; 11940 } 11941 } 11942 } 11943 } 11944 11945 return SDValue(); 11946} 11947 11948/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 11949static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 11950 TargetLowering::DAGCombinerInfo &DCI) { 11951 DebugLoc DL = N->getDebugLoc(); 11952 11953 // If the flag operand isn't dead, don't touch this CMOV. 11954 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 11955 return SDValue(); 11956 11957 SDValue FalseOp = N->getOperand(0); 11958 SDValue TrueOp = N->getOperand(1); 11959 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 11960 SDValue Cond = N->getOperand(3); 11961 if (CC == X86::COND_E || CC == X86::COND_NE) { 11962 switch (Cond.getOpcode()) { 11963 default: break; 11964 case X86ISD::BSR: 11965 case X86ISD::BSF: 11966 // If operand of BSR / BSF are proven never zero, then ZF cannot be set. 11967 if (DAG.isKnownNeverZero(Cond.getOperand(0))) 11968 return (CC == X86::COND_E) ? FalseOp : TrueOp; 11969 } 11970 } 11971 11972 // If this is a select between two integer constants, try to do some 11973 // optimizations. Note that the operands are ordered the opposite of SELECT 11974 // operands. 11975 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { 11976 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { 11977 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 11978 // larger than FalseC (the false value). 11979 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 11980 CC = X86::GetOppositeBranchCondition(CC); 11981 std::swap(TrueC, FalseC); 11982 } 11983 11984 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 11985 // This is efficient for any integer data type (including i8/i16) and 11986 // shift amount. 11987 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 11988 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 11989 DAG.getConstant(CC, MVT::i8), Cond); 11990 11991 // Zero extend the condition if needed. 11992 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 11993 11994 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 11995 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 11996 DAG.getConstant(ShAmt, MVT::i8)); 11997 if (N->getNumValues() == 2) // Dead flag value? 11998 return DCI.CombineTo(N, Cond, SDValue()); 11999 return Cond; 12000 } 12001 12002 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 12003 // for any integer data type, including i8/i16. 12004 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 12005 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 12006 DAG.getConstant(CC, MVT::i8), Cond); 12007 12008 // Zero extend the condition if needed. 12009 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 12010 FalseC->getValueType(0), Cond); 12011 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 12012 SDValue(FalseC, 0)); 12013 12014 if (N->getNumValues() == 2) // Dead flag value? 
12015 return DCI.CombineTo(N, Cond, SDValue()); 12016 return Cond; 12017 } 12018 12019 // Optimize cases that will turn into an LEA instruction. This requires 12020 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 12021 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 12022 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 12023 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 12024 12025 bool isFastMultiplier = false; 12026 if (Diff < 10) { 12027 switch ((unsigned char)Diff) { 12028 default: break; 12029 case 1: // result = add base, cond 12030 case 2: // result = lea base( , cond*2) 12031 case 3: // result = lea base(cond, cond*2) 12032 case 4: // result = lea base( , cond*4) 12033 case 5: // result = lea base(cond, cond*4) 12034 case 8: // result = lea base( , cond*8) 12035 case 9: // result = lea base(cond, cond*8) 12036 isFastMultiplier = true; 12037 break; 12038 } 12039 } 12040 12041 if (isFastMultiplier) { 12042 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 12043 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 12044 DAG.getConstant(CC, MVT::i8), Cond); 12045 // Zero extend the condition if needed. 12046 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 12047 Cond); 12048 // Scale the condition by the difference. 12049 if (Diff != 1) 12050 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 12051 DAG.getConstant(Diff, Cond.getValueType())); 12052 12053 // Add the base if non-zero. 12054 if (FalseC->getAPIntValue() != 0) 12055 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 12056 SDValue(FalseC, 0)); 12057 if (N->getNumValues() == 2) // Dead flag value? 12058 return DCI.CombineTo(N, Cond, SDValue()); 12059 return Cond; 12060 } 12061 } 12062 } 12063 } 12064 return SDValue(); 12065} 12066 12067 12068/// PerformMulCombine - Optimize a single multiply with constant into two 12069/// in order to implement it with two cheaper instructions, e.g. 12070/// LEA + SHL, LEA + LEA. 12071static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 12072 TargetLowering::DAGCombinerInfo &DCI) { 12073 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 12074 return SDValue(); 12075 12076 EVT VT = N->getValueType(0); 12077 if (VT != MVT::i64) 12078 return SDValue(); 12079 12080 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 12081 if (!C) 12082 return SDValue(); 12083 uint64_t MulAmt = C->getZExtValue(); 12084 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 12085 return SDValue(); 12086 12087 uint64_t MulAmt1 = 0; 12088 uint64_t MulAmt2 = 0; 12089 if ((MulAmt % 9) == 0) { 12090 MulAmt1 = 9; 12091 MulAmt2 = MulAmt / 9; 12092 } else if ((MulAmt % 5) == 0) { 12093 MulAmt1 = 5; 12094 MulAmt2 = MulAmt / 5; 12095 } else if ((MulAmt % 3) == 0) { 12096 MulAmt1 = 3; 12097 MulAmt2 = MulAmt / 3; 12098 } 12099 if (MulAmt2 && 12100 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 12101 DebugLoc DL = N->getDebugLoc(); 12102 12103 if (isPowerOf2_64(MulAmt2) && 12104 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 12105 // If the second multiplier is pow2, issue it first. We want the multiply by 12106 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 12107 // is an add.
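// Hedged examples (constants chosen only for illustration): x*45 factors as
// (x*9)*5 and stays as two X86ISD::MUL_IMM nodes, i.e. two LEAs; x*40 factors
// as (x*5)*8, and when the lone-use-is-an-add exception above does not apply
// the power-of-two factor is issued first, giving a SHL by 3 followed by an
// LEA for the multiply by 5.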
12108 std::swap(MulAmt1, MulAmt2); 12109 12110 SDValue NewMul; 12111 if (isPowerOf2_64(MulAmt1)) 12112 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 12113 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 12114 else 12115 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 12116 DAG.getConstant(MulAmt1, VT)); 12117 12118 if (isPowerOf2_64(MulAmt2)) 12119 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 12120 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 12121 else 12122 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 12123 DAG.getConstant(MulAmt2, VT)); 12124 12125 // Do not add new nodes to DAG combiner worklist. 12126 DCI.CombineTo(N, NewMul, false); 12127 } 12128 return SDValue(); 12129} 12130 12131static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 12132 SDValue N0 = N->getOperand(0); 12133 SDValue N1 = N->getOperand(1); 12134 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 12135 EVT VT = N0.getValueType(); 12136 12137 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 12138 // since the result of setcc_c is all zeros or all ones. 12139 if (N1C && N0.getOpcode() == ISD::AND && 12140 N0.getOperand(1).getOpcode() == ISD::Constant) { 12141 SDValue N00 = N0.getOperand(0); 12142 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 12143 ((N00.getOpcode() == ISD::ANY_EXTEND || 12144 N00.getOpcode() == ISD::ZERO_EXTEND) && 12145 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 12146 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 12147 APInt ShAmt = N1C->getAPIntValue(); 12148 Mask = Mask.shl(ShAmt); 12149 if (Mask != 0) 12150 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, 12151 N00, DAG.getConstant(Mask, VT)); 12152 } 12153 } 12154 12155 return SDValue(); 12156} 12157 12158/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 12159/// when possible. 12160static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 12161 const X86Subtarget *Subtarget) { 12162 EVT VT = N->getValueType(0); 12163 if (!VT.isVector() && VT.isInteger() && 12164 N->getOpcode() == ISD::SHL) 12165 return PerformSHLCombine(N, DAG); 12166 12167 // On X86 with SSE2 support, we can transform this to a vector shift if 12168 // all elements are shifted by the same amount. We can't do this in legalize 12169 // because a constant vector is typically transformed to a constant pool 12170 // so we have no knowledge of the shift amount.
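// Rough illustration (types and counts are examples only): a DAG like
// (shl v4i32 %x, <4,4,4,4>), where every lane uses the same amount, is
// rewritten below into the x86_sse2_pslli_d intrinsic with a scalar count of
// 4, which selects to a single PSLLD instead of scalarized shifts.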
12171 if (!(Subtarget->hasSSE2() || Subtarget->hasAVX())) 12172 return SDValue(); 12173 12174 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 12175 return SDValue(); 12176 12177 SDValue ShAmtOp = N->getOperand(1); 12178 EVT EltVT = VT.getVectorElementType(); 12179 DebugLoc DL = N->getDebugLoc(); 12180 SDValue BaseShAmt = SDValue(); 12181 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 12182 unsigned NumElts = VT.getVectorNumElements(); 12183 unsigned i = 0; 12184 for (; i != NumElts; ++i) { 12185 SDValue Arg = ShAmtOp.getOperand(i); 12186 if (Arg.getOpcode() == ISD::UNDEF) continue; 12187 BaseShAmt = Arg; 12188 break; 12189 } 12190 for (; i != NumElts; ++i) { 12191 SDValue Arg = ShAmtOp.getOperand(i); 12192 if (Arg.getOpcode() == ISD::UNDEF) continue; 12193 if (Arg != BaseShAmt) { 12194 return SDValue(); 12195 } 12196 } 12197 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 12198 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 12199 SDValue InVec = ShAmtOp.getOperand(0); 12200 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 12201 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 12202 unsigned i = 0; 12203 for (; i != NumElts; ++i) { 12204 SDValue Arg = InVec.getOperand(i); 12205 if (Arg.getOpcode() == ISD::UNDEF) continue; 12206 BaseShAmt = Arg; 12207 break; 12208 } 12209 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 12210 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 12211 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 12212 if (C->getZExtValue() == SplatIdx) 12213 BaseShAmt = InVec.getOperand(1); 12214 } 12215 } 12216 if (BaseShAmt.getNode() == 0) 12217 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 12218 DAG.getIntPtrConstant(0)); 12219 } else 12220 return SDValue(); 12221 12222 // The shift amount is an i32. 12223 if (EltVT.bitsGT(MVT::i32)) 12224 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 12225 else if (EltVT.bitsLT(MVT::i32)) 12226 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 12227 12228 // The shift amount is identical so we can do a vector shift. 
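// Note on the mapping below: SHL/SRL use the SSE2 immediate-count logical
// shift intrinsics (pslli.*/psrli.*) and SRA uses psrai.*; SSE2 has no 64-bit
// arithmetic right shift, which is why the SRA case handles only v4i32 and
// v8i16.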
12229 SDValue ValOp = N->getOperand(0); 12230 switch (N->getOpcode()) { 12231 default: 12232 llvm_unreachable("Unknown shift opcode!"); 12233 break; 12234 case ISD::SHL: 12235 if (VT == MVT::v2i64) 12236 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12237 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 12238 ValOp, BaseShAmt); 12239 if (VT == MVT::v4i32) 12240 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12241 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 12242 ValOp, BaseShAmt); 12243 if (VT == MVT::v8i16) 12244 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12245 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 12246 ValOp, BaseShAmt); 12247 break; 12248 case ISD::SRA: 12249 if (VT == MVT::v4i32) 12250 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12251 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 12252 ValOp, BaseShAmt); 12253 if (VT == MVT::v8i16) 12254 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12255 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 12256 ValOp, BaseShAmt); 12257 break; 12258 case ISD::SRL: 12259 if (VT == MVT::v2i64) 12260 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12261 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 12262 ValOp, BaseShAmt); 12263 if (VT == MVT::v4i32) 12264 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12265 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 12266 ValOp, BaseShAmt); 12267 if (VT == MVT::v8i16) 12268 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12269 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 12270 ValOp, BaseShAmt); 12271 break; 12272 } 12273 return SDValue(); 12274} 12275 12276 12277// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) 12278// where both setccs reference the same FP CMP, and rewrite for CMPEQSS 12279// and friends. Likewise for OR -> CMPNEQSS. 12280static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, 12281 TargetLowering::DAGCombinerInfo &DCI, 12282 const X86Subtarget *Subtarget) { 12283 unsigned opcode; 12284 12285 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but 12286 // we're requiring SSE2 for both. 12287 if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { 12288 SDValue N0 = N->getOperand(0); 12289 SDValue N1 = N->getOperand(1); 12290 SDValue CMP0 = N0->getOperand(1); 12291 SDValue CMP1 = N1->getOperand(1); 12292 DebugLoc DL = N->getDebugLoc(); 12293 12294 // The SETCCs should both refer to the same CMP. 
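// Illustrative shape of the pattern being matched (operands abbreviated):
//   (and (setcc COND_E,  (X86ISD::CMP a, b)),
//        (setcc COND_NP, (X86ISD::CMP a, b)))
// e.g. the "ordered and equal" idiom produced for a == b on scalar floats.
// When it matches, the code below emits FSETCCss/FSETCCsd (CMPEQSS/CMPEQSD)
// and masks the low bit instead of materializing two flag-based setccs.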
12295 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) 12296 return SDValue(); 12297 12298 SDValue CMP00 = CMP0->getOperand(0); 12299 SDValue CMP01 = CMP0->getOperand(1); 12300 EVT VT = CMP00.getValueType(); 12301 12302 if (VT == MVT::f32 || VT == MVT::f64) { 12303 bool ExpectingFlags = false; 12304 // Check for any users that want flags: 12305 for (SDNode::use_iterator UI = N->use_begin(), 12306 UE = N->use_end(); 12307 !ExpectingFlags && UI != UE; ++UI) 12308 switch (UI->getOpcode()) { 12309 default: 12310 case ISD::BR_CC: 12311 case ISD::BRCOND: 12312 case ISD::SELECT: 12313 ExpectingFlags = true; 12314 break; 12315 case ISD::CopyToReg: 12316 case ISD::SIGN_EXTEND: 12317 case ISD::ZERO_EXTEND: 12318 case ISD::ANY_EXTEND: 12319 break; 12320 } 12321 12322 if (!ExpectingFlags) { 12323 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); 12324 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); 12325 12326 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { 12327 X86::CondCode tmp = cc0; 12328 cc0 = cc1; 12329 cc1 = tmp; 12330 } 12331 12332 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || 12333 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { 12334 bool is64BitFP = (CMP00.getValueType() == MVT::f64); 12335 X86ISD::NodeType NTOperator = is64BitFP ? 12336 X86ISD::FSETCCsd : X86ISD::FSETCCss; 12337 // FIXME: need symbolic constants for these magic numbers. 12338 // See X86ATTInstPrinter.cpp:printSSECC(). 12339 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; 12340 SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01, 12341 DAG.getConstant(x86cc, MVT::i8)); 12342 SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32, 12343 OnesOrZeroesF); 12344 SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI, 12345 DAG.getConstant(1, MVT::i32)); 12346 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); 12347 return OneBitOfTruth; 12348 } 12349 } 12350 } 12351 } 12352 return SDValue(); 12353} 12354 12355/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector 12356/// so it can be folded inside ANDNP. 12357static bool CanFoldXORWithAllOnes(const SDNode *N) { 12358 EVT VT = N->getValueType(0); 12359 12360 // Match direct AllOnes for 128 and 256-bit vectors 12361 if (ISD::isBuildVectorAllOnes(N)) 12362 return true; 12363 12364 // Look through a bit convert. 12365 if (N->getOpcode() == ISD::BITCAST) 12366 N = N->getOperand(0).getNode(); 12367 12368 // Sometimes the operand may come from a insert_subvector building a 256-bit 12369 // allones vector 12370 if (VT.getSizeInBits() == 256 && 12371 N->getOpcode() == ISD::INSERT_SUBVECTOR) { 12372 SDValue V1 = N->getOperand(0); 12373 SDValue V2 = N->getOperand(1); 12374 12375 if (V1.getOpcode() == ISD::INSERT_SUBVECTOR && 12376 V1.getOperand(0).getOpcode() == ISD::UNDEF && 12377 ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && 12378 ISD::isBuildVectorAllOnes(V2.getNode())) 12379 return true; 12380 } 12381 12382 return false; 12383} 12384 12385static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, 12386 TargetLowering::DAGCombinerInfo &DCI, 12387 const X86Subtarget *Subtarget) { 12388 if (DCI.isBeforeLegalizeOps()) 12389 return SDValue(); 12390 12391 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 12392 if (R.getNode()) 12393 return R; 12394 12395 // Want to form ANDNP nodes: 12396 // 1) In the hopes of then easily combining them with OR and AND nodes 12397 // to form PBLEND/PSIGN. 
12398 // 2) To match ANDN packed intrinsics 12399 EVT VT = N->getValueType(0); 12400 if (VT != MVT::v2i64 && VT != MVT::v4i64) 12401 return SDValue(); 12402 12403 SDValue N0 = N->getOperand(0); 12404 SDValue N1 = N->getOperand(1); 12405 DebugLoc DL = N->getDebugLoc(); 12406 12407 // Check LHS for vnot 12408 if (N0.getOpcode() == ISD::XOR && 12409 //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) 12410 CanFoldXORWithAllOnes(N0.getOperand(1).getNode())) 12411 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); 12412 12413 // Check RHS for vnot 12414 if (N1.getOpcode() == ISD::XOR && 12415 //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) 12416 CanFoldXORWithAllOnes(N1.getOperand(1).getNode())) 12417 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); 12418 12419 return SDValue(); 12420} 12421 12422static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 12423 TargetLowering::DAGCombinerInfo &DCI, 12424 const X86Subtarget *Subtarget) { 12425 if (DCI.isBeforeLegalizeOps()) 12426 return SDValue(); 12427 12428 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 12429 if (R.getNode()) 12430 return R; 12431 12432 EVT VT = N->getValueType(0); 12433 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64) 12434 return SDValue(); 12435 12436 SDValue N0 = N->getOperand(0); 12437 SDValue N1 = N->getOperand(1); 12438 12439 // look for psign/blend 12440 if (Subtarget->hasSSSE3()) { 12441 if (VT == MVT::v2i64) { 12442 // Canonicalize pandn to RHS 12443 if (N0.getOpcode() == X86ISD::ANDNP) 12444 std::swap(N0, N1); 12445 // or (and (m, x), (pandn m, y)) 12446 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { 12447 SDValue Mask = N1.getOperand(0); 12448 SDValue X = N1.getOperand(1); 12449 SDValue Y; 12450 if (N0.getOperand(0) == Mask) 12451 Y = N0.getOperand(1); 12452 if (N0.getOperand(1) == Mask) 12453 Y = N0.getOperand(0); 12454 12455 // Check to see if the mask appeared in both the AND and ANDNP and 12456 if (!Y.getNode()) 12457 return SDValue(); 12458 12459 // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. 12460 if (Mask.getOpcode() != ISD::BITCAST || 12461 X.getOpcode() != ISD::BITCAST || 12462 Y.getOpcode() != ISD::BITCAST) 12463 return SDValue(); 12464 12465 // Look through mask bitcast. 12466 Mask = Mask.getOperand(0); 12467 EVT MaskVT = Mask.getValueType(); 12468 12469 // Validate that the Mask operand is a vector sra node. The sra node 12470 // will be an intrinsic. 12471 if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN) 12472 return SDValue(); 12473 12474 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but 12475 // there is no psrai.b 12476 switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) { 12477 case Intrinsic::x86_sse2_psrai_w: 12478 case Intrinsic::x86_sse2_psrai_d: 12479 break; 12480 default: return SDValue(); 12481 } 12482 12483 // Check that the SRA is all signbits. 12484 SDValue SraC = Mask.getOperand(2); 12485 unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); 12486 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); 12487 if ((SraAmt + 1) != EltBits) 12488 return SDValue(); 12489 12490 DebugLoc DL = N->getDebugLoc(); 12491 12492 // Now we know we at least have a plendvb with the mask val. See if 12493 // we can form a psignb/w/d. 
12494 // psign = x.type == y.type == mask.type && y = sub(0, x); 12495 X = X.getOperand(0); 12496 Y = Y.getOperand(0); 12497 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && 12498 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && 12499 X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){ 12500 unsigned Opc = 0; 12501 switch (EltBits) { 12502 case 8: Opc = X86ISD::PSIGNB; break; 12503 case 16: Opc = X86ISD::PSIGNW; break; 12504 case 32: Opc = X86ISD::PSIGND; break; 12505 default: break; 12506 } 12507 if (Opc) { 12508 SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1)); 12509 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign); 12510 } 12511 } 12512 // PBLENDVB only available on SSE 4.1 12513 if (!Subtarget->hasSSE41()) 12514 return SDValue(); 12515 12516 X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X); 12517 Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y); 12518 Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask); 12519 Mask = DAG.getNode(X86ISD::PBLENDVB, DL, MVT::v16i8, X, Y, Mask); 12520 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask); 12521 } 12522 } 12523 } 12524 12525 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 12526 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 12527 std::swap(N0, N1); 12528 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 12529 return SDValue(); 12530 if (!N0.hasOneUse() || !N1.hasOneUse()) 12531 return SDValue(); 12532 12533 SDValue ShAmt0 = N0.getOperand(1); 12534 if (ShAmt0.getValueType() != MVT::i8) 12535 return SDValue(); 12536 SDValue ShAmt1 = N1.getOperand(1); 12537 if (ShAmt1.getValueType() != MVT::i8) 12538 return SDValue(); 12539 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 12540 ShAmt0 = ShAmt0.getOperand(0); 12541 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 12542 ShAmt1 = ShAmt1.getOperand(0); 12543 12544 DebugLoc DL = N->getDebugLoc(); 12545 unsigned Opc = X86ISD::SHLD; 12546 SDValue Op0 = N0.getOperand(0); 12547 SDValue Op1 = N1.getOperand(0); 12548 if (ShAmt0.getOpcode() == ISD::SUB) { 12549 Opc = X86ISD::SHRD; 12550 std::swap(Op0, Op1); 12551 std::swap(ShAmt0, ShAmt1); 12552 } 12553 12554 unsigned Bits = VT.getSizeInBits(); 12555 if (ShAmt1.getOpcode() == ISD::SUB) { 12556 SDValue Sum = ShAmt1.getOperand(0); 12557 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 12558 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 12559 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 12560 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 12561 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 12562 return DAG.getNode(Opc, DL, VT, 12563 Op0, Op1, 12564 DAG.getNode(ISD::TRUNCATE, DL, 12565 MVT::i8, ShAmt0)); 12566 } 12567 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 12568 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 12569 if (ShAmt0C && 12570 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 12571 return DAG.getNode(Opc, DL, VT, 12572 N0.getOperand(0), N1.getOperand(0), 12573 DAG.getNode(ISD::TRUNCATE, DL, 12574 MVT::i8, ShAmt0)); 12575 } 12576 12577 return SDValue(); 12578} 12579 12580/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 12581static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 12582 const X86Subtarget *Subtarget) { 12583 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 12584 // the FP state in cases where an emms may be missing. 
12585 // A preferable solution to the general problem is to figure out the right 12586 // places to insert EMMS. This qualifies as a quick hack. 12587 12588 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 12589 StoreSDNode *St = cast<StoreSDNode>(N); 12590 EVT VT = St->getValue().getValueType(); 12591 if (VT.getSizeInBits() != 64) 12592 return SDValue(); 12593 12594 const Function *F = DAG.getMachineFunction().getFunction(); 12595 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 12596 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 12597 && Subtarget->hasSSE2(); 12598 if ((VT.isVector() || 12599 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 12600 isa<LoadSDNode>(St->getValue()) && 12601 !cast<LoadSDNode>(St->getValue())->isVolatile() && 12602 St->getChain().hasOneUse() && !St->isVolatile()) { 12603 SDNode* LdVal = St->getValue().getNode(); 12604 LoadSDNode *Ld = 0; 12605 int TokenFactorIndex = -1; 12606 SmallVector<SDValue, 8> Ops; 12607 SDNode* ChainVal = St->getChain().getNode(); 12608 // Must be a store of a load. We currently handle two cases: the load 12609 // is a direct child, and it's under an intervening TokenFactor. It is 12610 // possible to dig deeper under nested TokenFactors. 12611 if (ChainVal == LdVal) 12612 Ld = cast<LoadSDNode>(St->getChain()); 12613 else if (St->getValue().hasOneUse() && 12614 ChainVal->getOpcode() == ISD::TokenFactor) { 12615 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 12616 if (ChainVal->getOperand(i).getNode() == LdVal) { 12617 TokenFactorIndex = i; 12618 Ld = cast<LoadSDNode>(St->getValue()); 12619 } else 12620 Ops.push_back(ChainVal->getOperand(i)); 12621 } 12622 } 12623 12624 if (!Ld || !ISD::isNormalLoad(Ld)) 12625 return SDValue(); 12626 12627 // If this is not the MMX case, i.e. we are just turning i64 load/store 12628 // into f64 load/store, avoid the transformation if there are multiple 12629 // uses of the loaded value. 12630 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 12631 return SDValue(); 12632 12633 DebugLoc LdDL = Ld->getDebugLoc(); 12634 DebugLoc StDL = N->getDebugLoc(); 12635 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 12636 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 12637 // pair instead. 12638 if (Subtarget->is64Bit() || F64IsLegal) { 12639 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 12640 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 12641 Ld->getPointerInfo(), Ld->isVolatile(), 12642 Ld->isNonTemporal(), Ld->getAlignment()); 12643 SDValue NewChain = NewLd.getValue(1); 12644 if (TokenFactorIndex != -1) { 12645 Ops.push_back(NewChain); 12646 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 12647 Ops.size()); 12648 } 12649 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 12650 St->getPointerInfo(), 12651 St->isVolatile(), St->isNonTemporal(), 12652 St->getAlignment()); 12653 } 12654 12655 // Otherwise, lower to two pairs of 32-bit loads / stores. 
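// Sketch of the result (offsets follow the usual little-endian layout): the
// i64 value is reloaded as two i32 loads at +0 and +4 off the original
// pointer and stored back as two i32 stores, so the 64-bit copy never
// touches the MMX/x87 register file.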
12656 SDValue LoAddr = Ld->getBasePtr(); 12657 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 12658 DAG.getConstant(4, MVT::i32)); 12659 12660 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 12661 Ld->getPointerInfo(), 12662 Ld->isVolatile(), Ld->isNonTemporal(), 12663 Ld->getAlignment()); 12664 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 12665 Ld->getPointerInfo().getWithOffset(4), 12666 Ld->isVolatile(), Ld->isNonTemporal(), 12667 MinAlign(Ld->getAlignment(), 4)); 12668 12669 SDValue NewChain = LoLd.getValue(1); 12670 if (TokenFactorIndex != -1) { 12671 Ops.push_back(LoLd); 12672 Ops.push_back(HiLd); 12673 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 12674 Ops.size()); 12675 } 12676 12677 LoAddr = St->getBasePtr(); 12678 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 12679 DAG.getConstant(4, MVT::i32)); 12680 12681 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 12682 St->getPointerInfo(), 12683 St->isVolatile(), St->isNonTemporal(), 12684 St->getAlignment()); 12685 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 12686 St->getPointerInfo().getWithOffset(4), 12687 St->isVolatile(), 12688 St->isNonTemporal(), 12689 MinAlign(St->getAlignment(), 4)); 12690 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 12691 } 12692 return SDValue(); 12693} 12694 12695/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 12696/// X86ISD::FXOR nodes. 12697static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 12698 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 12699 // F[X]OR(0.0, x) -> x 12700 // F[X]OR(x, 0.0) -> x 12701 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 12702 if (C->getValueAPF().isPosZero()) 12703 return N->getOperand(1); 12704 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 12705 if (C->getValueAPF().isPosZero()) 12706 return N->getOperand(0); 12707 return SDValue(); 12708} 12709 12710/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 12711static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 12712 // FAND(0.0, x) -> 0.0 12713 // FAND(x, 0.0) -> 0.0 12714 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 12715 if (C->getValueAPF().isPosZero()) 12716 return N->getOperand(0); 12717 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 12718 if (C->getValueAPF().isPosZero()) 12719 return N->getOperand(1); 12720 return SDValue(); 12721} 12722 12723static SDValue PerformBTCombine(SDNode *N, 12724 SelectionDAG &DAG, 12725 TargetLowering::DAGCombinerInfo &DCI) { 12726 // BT ignores high bits in the bit index operand. 
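// Example of the effect (bit widths are illustrative): for a 32-bit bit-index
// operand only the low 5 bits are demanded (Log2_32(32) == 5), so an index
// such as (and %i, 31) can have the mask removed by ShrinkDemandedConstant /
// SimplifyDemandedBits below.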
12727 SDValue Op1 = N->getOperand(1); 12728 if (Op1.hasOneUse()) { 12729 unsigned BitWidth = Op1.getValueSizeInBits(); 12730 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 12731 APInt KnownZero, KnownOne; 12732 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 12733 !DCI.isBeforeLegalizeOps()); 12734 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12735 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 12736 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 12737 DCI.CommitTargetLoweringOpt(TLO); 12738 } 12739 return SDValue(); 12740} 12741 12742static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 12743 SDValue Op = N->getOperand(0); 12744 if (Op.getOpcode() == ISD::BITCAST) 12745 Op = Op.getOperand(0); 12746 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 12747 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 12748 VT.getVectorElementType().getSizeInBits() == 12749 OpVT.getVectorElementType().getSizeInBits()) { 12750 return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); 12751 } 12752 return SDValue(); 12753} 12754 12755static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 12756 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 12757 // (and (i32 x86isd::setcc_carry), 1) 12758 // This eliminates the zext. This transformation is necessary because 12759 // ISD::SETCC is always legalized to i8. 12760 DebugLoc dl = N->getDebugLoc(); 12761 SDValue N0 = N->getOperand(0); 12762 EVT VT = N->getValueType(0); 12763 if (N0.getOpcode() == ISD::AND && 12764 N0.hasOneUse() && 12765 N0.getOperand(0).hasOneUse()) { 12766 SDValue N00 = N0.getOperand(0); 12767 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 12768 return SDValue(); 12769 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 12770 if (!C || C->getZExtValue() != 1) 12771 return SDValue(); 12772 return DAG.getNode(ISD::AND, dl, VT, 12773 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 12774 N00.getOperand(0), N00.getOperand(1)), 12775 DAG.getConstant(1, VT)); 12776 } 12777 12778 return SDValue(); 12779} 12780 12781// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 12782static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { 12783 unsigned X86CC = N->getConstantOperandVal(0); 12784 SDValue EFLAG = N->getOperand(1); 12785 DebugLoc DL = N->getDebugLoc(); 12786 12787 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 12788 // a zext and produces an all-ones bit which is more useful than 0/1 in some 12789 // cases. 12790 if (X86CC == X86::COND_B) 12791 return DAG.getNode(ISD::AND, DL, MVT::i8, 12792 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 12793 DAG.getConstant(X86CC, MVT::i8), EFLAG), 12794 DAG.getConstant(1, MVT::i8)); 12795 12796 return SDValue(); 12797} 12798 12799static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, 12800 const X86TargetLowering *XTLI) { 12801 SDValue Op0 = N->getOperand(0); 12802 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have 12803 // a 32-bit target where SSE doesn't support i64->FP operations. 
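// Rough illustration (assuming a 32-bit target where i64 is not a legal
// type): for (sint_to_fp (load i64 %p)) the code below builds an x87 FILD
// via BuildFILD that reads directly from %p, and rewires users of the
// original load's output chain to the FILD's chain.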
12804 if (Op0.getOpcode() == ISD::LOAD) { 12805 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); 12806 EVT VT = Ld->getValueType(0); 12807 if (!Ld->isVolatile() && !N->getValueType(0).isVector() && 12808 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && 12809 !XTLI->getSubtarget()->is64Bit() && 12810 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 12811 SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), 12812 Ld->getChain(), Op0, DAG); 12813 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); 12814 return FILDChain; 12815 } 12816 } 12817 return SDValue(); 12818} 12819 12820// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS 12821static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, 12822 X86TargetLowering::DAGCombinerInfo &DCI) { 12823 // If the LHS and RHS of the ADC node are zero, then it can't overflow and 12824 // the result is either zero or one (depending on the input carry bit). 12825 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 12826 if (X86::isZeroNode(N->getOperand(0)) && 12827 X86::isZeroNode(N->getOperand(1)) && 12828 // We don't have a good way to replace an EFLAGS use, so only do this when 12829 // dead right now. 12830 SDValue(N, 1).use_empty()) { 12831 DebugLoc DL = N->getDebugLoc(); 12832 EVT VT = N->getValueType(0); 12833 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); 12834 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, 12835 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, 12836 DAG.getConstant(X86::COND_B,MVT::i8), 12837 N->getOperand(2)), 12838 DAG.getConstant(1, VT)); 12839 return DCI.CombineTo(N, Res1, CarryOut); 12840 } 12841 12842 return SDValue(); 12843} 12844 12845// fold (add Y, (sete X, 0)) -> adc 0, Y 12846// (add Y, (setne X, 0)) -> sbb -1, Y 12847// (sub (sete X, 0), Y) -> sbb 0, Y 12848// (sub (setne X, 0), Y) -> adc -1, Y 12849static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { 12850 DebugLoc DL = N->getDebugLoc(); 12851 12852 // Look through ZExts. 12853 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); 12854 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) 12855 return SDValue(); 12856 12857 SDValue SetCC = Ext.getOperand(0); 12858 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) 12859 return SDValue(); 12860 12861 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); 12862 if (CC != X86::COND_E && CC != X86::COND_NE) 12863 return SDValue(); 12864 12865 SDValue Cmp = SetCC.getOperand(1); 12866 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || 12867 !X86::isZeroNode(Cmp.getOperand(1)) || 12868 !Cmp.getOperand(0).getValueType().isInteger()) 12869 return SDValue(); 12870 12871 SDValue CmpOp0 = Cmp.getOperand(0); 12872 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, 12873 DAG.getConstant(1, CmpOp0.getValueType())); 12874 12875 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); 12876 if (CC == X86::COND_NE) 12877 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, 12878 DL, OtherVal.getValueType(), OtherVal, 12879 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); 12880 return DAG.getNode(N->getOpcode() == ISD::SUB ? 
X86ISD::SBB : X86ISD::ADC, 12881 DL, OtherVal.getValueType(), OtherVal, 12882 DAG.getConstant(0, OtherVal.getValueType()), NewCmp); 12883} 12884 12885static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) { 12886 SDValue Op0 = N->getOperand(0); 12887 SDValue Op1 = N->getOperand(1); 12888 12889 // X86 can't encode an immediate LHS of a sub. See if we can push the 12890 // negation into a preceding instruction. 12891 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) { 12892 uint64_t Op0C = C->getSExtValue(); 12893 12894 // If the RHS of the sub is a XOR with one use and a constant, invert the 12895 // immediate. Then add one to the LHS of the sub so we can turn 12896 // X-Y -> X+~Y+1, saving one register. 12897 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && 12898 isa<ConstantSDNode>(Op1.getOperand(1))) { 12899 uint64_t XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getSExtValue(); 12900 EVT VT = Op0.getValueType(); 12901 SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT, 12902 Op1.getOperand(0), 12903 DAG.getConstant(~XorC, VT)); 12904 return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor, 12905 DAG.getConstant(Op0C+1, VT)); 12906 } 12907 } 12908 12909 return OptimizeConditionalInDecrement(N, DAG); 12910} 12911 12912SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 12913 DAGCombinerInfo &DCI) const { 12914 SelectionDAG &DAG = DCI.DAG; 12915 switch (N->getOpcode()) { 12916 default: break; 12917 case ISD::EXTRACT_VECTOR_ELT: 12918 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 12919 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 12920 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 12921 case ISD::ADD: return OptimizeConditionalInDecrement(N, DAG); 12922 case ISD::SUB: return PerformSubCombine(N, DAG); 12923 case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); 12924 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 12925 case ISD::SHL: 12926 case ISD::SRA: 12927 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 12928 case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); 12929 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 12930 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 12931 case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); 12932 case X86ISD::FXOR: 12933 case X86ISD::FOR: return PerformFORCombine(N, DAG); 12934 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 12935 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 12936 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 12937 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 12938 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); 12939 case X86ISD::SHUFPS: // Handle all target specific shuffles 12940 case X86ISD::SHUFPD: 12941 case X86ISD::PALIGN: 12942 case X86ISD::PUNPCKHBW: 12943 case X86ISD::PUNPCKHWD: 12944 case X86ISD::PUNPCKHDQ: 12945 case X86ISD::PUNPCKHQDQ: 12946 case X86ISD::UNPCKHPS: 12947 case X86ISD::UNPCKHPD: 12948 case X86ISD::VUNPCKHPSY: 12949 case X86ISD::VUNPCKHPDY: 12950 case X86ISD::PUNPCKLBW: 12951 case X86ISD::PUNPCKLWD: 12952 case X86ISD::PUNPCKLDQ: 12953 case X86ISD::PUNPCKLQDQ: 12954 case X86ISD::UNPCKLPS: 12955 case X86ISD::UNPCKLPD: 12956 case X86ISD::VUNPCKLPSY: 12957 case X86ISD::VUNPCKLPDY: 12958 case X86ISD::MOVHLPS: 12959 case X86ISD::MOVLHPS: 12960 case X86ISD::PSHUFD: 12961 case X86ISD::PSHUFHW: 12962 case X86ISD::PSHUFLW: 12963 case X86ISD::MOVSS: 12964 case X86ISD::MOVSD: 12965 case X86ISD::VPERMILPS: 
12966 case X86ISD::VPERMILPSY: 12967 case X86ISD::VPERMILPD: 12968 case X86ISD::VPERMILPDY: 12969 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI); 12970 } 12971 12972 return SDValue(); 12973} 12974 12975/// isTypeDesirableForOp - Return true if the target has native support for 12976/// the specified value type and it is 'desirable' to use the type for the 12977/// given node type. e.g. On x86 i16 is legal, but undesirable since i16 12978/// instruction encodings are longer and some i16 instructions are slow. 12979bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { 12980 if (!isTypeLegal(VT)) 12981 return false; 12982 if (VT != MVT::i16) 12983 return true; 12984 12985 switch (Opc) { 12986 default: 12987 return true; 12988 case ISD::LOAD: 12989 case ISD::SIGN_EXTEND: 12990 case ISD::ZERO_EXTEND: 12991 case ISD::ANY_EXTEND: 12992 case ISD::SHL: 12993 case ISD::SRL: 12994 case ISD::SUB: 12995 case ISD::ADD: 12996 case ISD::MUL: 12997 case ISD::AND: 12998 case ISD::OR: 12999 case ISD::XOR: 13000 return false; 13001 } 13002} 13003 13004/// IsDesirableToPromoteOp - This method queries the target whether it is 13005/// beneficial for the dag combiner to promote the specified node. If true, it 13006/// should return the desired promotion type by reference. 13007bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { 13008 EVT VT = Op.getValueType(); 13009 if (VT != MVT::i16) 13010 return false; 13011 13012 bool Promote = false; 13013 bool Commute = false; 13014 switch (Op.getOpcode()) { 13015 default: break; 13016 case ISD::LOAD: { 13017 LoadSDNode *LD = cast<LoadSDNode>(Op); 13018 // If the non-extending load has a single use and it's not live out, then it 13019 // might be folded. 13020 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& 13021 Op.hasOneUse()*/) { 13022 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 13023 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 13024 // The only case where we'd want to promote LOAD (rather than it being 13025 // promoted as an operand) is when its only use is liveout. 13026 if (UI->getOpcode() != ISD::CopyToReg) 13027 return false; 13028 } 13029 } 13030 Promote = true; 13031 break; 13032 } 13033 case ISD::SIGN_EXTEND: 13034 case ISD::ZERO_EXTEND: 13035 case ISD::ANY_EXTEND: 13036 Promote = true; 13037 break; 13038 case ISD::SHL: 13039 case ISD::SRL: { 13040 SDValue N0 = Op.getOperand(0); 13041 // Look out for (store (shl (load), x)). 13042 if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) 13043 return false; 13044 Promote = true; 13045 break; 13046 } 13047 case ISD::ADD: 13048 case ISD::MUL: 13049 case ISD::AND: 13050 case ISD::OR: 13051 case ISD::XOR: 13052 Commute = true; 13053 // fallthrough 13054 case ISD::SUB: { 13055 SDValue N0 = Op.getOperand(0); 13056 SDValue N1 = Op.getOperand(1); 13057 if (!Commute && MayFoldLoad(N1)) 13058 return false; 13059 // Avoid disabling potential load folding opportunities.
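// Hedged example (i16 ops, operand names invented): for (or (load i16 %p), %y)
// the load can be folded into a memory-operand OR, and promoting the node to
// i32 would widen the access and lose that folding, so the checks below give
// up on promotion in that case.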
13060 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 13061 return false; 13062 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 13063 return false; 13064 Promote = true; 13065 } 13066 } 13067 13068 PVT = MVT::i32; 13069 return Promote; 13070} 13071 13072//===----------------------------------------------------------------------===// 13073// X86 Inline Assembly Support 13074//===----------------------------------------------------------------------===// 13075 13076bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 13077 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 13078 13079 std::string AsmStr = IA->getAsmString(); 13080 13081 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 13082 SmallVector<StringRef, 4> AsmPieces; 13083 SplitString(AsmStr, AsmPieces, ";\n"); 13084 13085 switch (AsmPieces.size()) { 13086 default: return false; 13087 case 1: 13088 AsmStr = AsmPieces[0]; 13089 AsmPieces.clear(); 13090 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 13091 13092 // FIXME: this should verify that we are targeting a 486 or better. If not, 13093 // we will turn this bswap into something that will be lowered to logical ops 13094 // instead of emitting the bswap asm. For now, we don't support 486 or lower 13095 // so don't worry about this. 13096 // bswap $0 13097 if (AsmPieces.size() == 2 && 13098 (AsmPieces[0] == "bswap" || 13099 AsmPieces[0] == "bswapq" || 13100 AsmPieces[0] == "bswapl") && 13101 (AsmPieces[1] == "$0" || 13102 AsmPieces[1] == "${0:q}")) { 13103 // No need to check constraints, nothing other than the equivalent of 13104 // "=r,0" would be valid here. 13105 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 13106 if (!Ty || Ty->getBitWidth() % 16 != 0) 13107 return false; 13108 return IntrinsicLowering::LowerToByteSwap(CI); 13109 } 13110 // rorw $$8, ${0:w} --> llvm.bswap.i16 13111 if (CI->getType()->isIntegerTy(16) && 13112 AsmPieces.size() == 3 && 13113 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 13114 AsmPieces[1] == "$$8," && 13115 AsmPieces[2] == "${0:w}" && 13116 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 13117 AsmPieces.clear(); 13118 const std::string &ConstraintsStr = IA->getConstraintString(); 13119 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 13120 std::sort(AsmPieces.begin(), AsmPieces.end()); 13121 if (AsmPieces.size() == 4 && 13122 AsmPieces[0] == "~{cc}" && 13123 AsmPieces[1] == "~{dirflag}" && 13124 AsmPieces[2] == "~{flags}" && 13125 AsmPieces[3] == "~{fpsr}") { 13126 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 13127 if (!Ty || Ty->getBitWidth() % 16 != 0) 13128 return false; 13129 return IntrinsicLowering::LowerToByteSwap(CI); 13130 } 13131 } 13132 break; 13133 case 3: 13134 if (CI->getType()->isIntegerTy(32) && 13135 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 13136 SmallVector<StringRef, 4> Words; 13137 SplitString(AsmPieces[0], Words, " \t,"); 13138 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 13139 Words[2] == "${0:w}") { 13140 Words.clear(); 13141 SplitString(AsmPieces[1], Words, " \t,"); 13142 if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" && 13143 Words[2] == "$0") { 13144 Words.clear(); 13145 SplitString(AsmPieces[2], Words, " \t,"); 13146 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 13147 Words[2] == "${0:w}") { 13148 AsmPieces.clear(); 13149 const std::string &ConstraintsStr = 
IA->getConstraintString(); 13150 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 13151 std::sort(AsmPieces.begin(), AsmPieces.end()); 13152 if (AsmPieces.size() == 4 && 13153 AsmPieces[0] == "~{cc}" && 13154 AsmPieces[1] == "~{dirflag}" && 13155 AsmPieces[2] == "~{flags}" && 13156 AsmPieces[3] == "~{fpsr}") { 13157 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 13158 if (!Ty || Ty->getBitWidth() % 16 != 0) 13159 return false; 13160 return IntrinsicLowering::LowerToByteSwap(CI); 13161 } 13162 } 13163 } 13164 } 13165 } 13166 13167 if (CI->getType()->isIntegerTy(64)) { 13168 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); 13169 if (Constraints.size() >= 2 && 13170 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 13171 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 13172 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 13173 SmallVector<StringRef, 4> Words; 13174 SplitString(AsmPieces[0], Words, " \t"); 13175 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 13176 Words.clear(); 13177 SplitString(AsmPieces[1], Words, " \t"); 13178 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 13179 Words.clear(); 13180 SplitString(AsmPieces[2], Words, " \t,"); 13181 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 13182 Words[2] == "%edx") { 13183 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 13184 if (!Ty || Ty->getBitWidth() % 16 != 0) 13185 return false; 13186 return IntrinsicLowering::LowerToByteSwap(CI); 13187 } 13188 } 13189 } 13190 } 13191 } 13192 break; 13193 } 13194 return false; 13195} 13196 13197 13198 13199/// getConstraintType - Given a constraint letter, return the type of 13200/// constraint it is for this target. 13201X86TargetLowering::ConstraintType 13202X86TargetLowering::getConstraintType(const std::string &Constraint) const { 13203 if (Constraint.size() == 1) { 13204 switch (Constraint[0]) { 13205 case 'R': 13206 case 'q': 13207 case 'Q': 13208 case 'f': 13209 case 't': 13210 case 'u': 13211 case 'y': 13212 case 'x': 13213 case 'Y': 13214 case 'l': 13215 return C_RegisterClass; 13216 case 'a': 13217 case 'b': 13218 case 'c': 13219 case 'd': 13220 case 'S': 13221 case 'D': 13222 case 'A': 13223 return C_Register; 13224 case 'I': 13225 case 'J': 13226 case 'K': 13227 case 'L': 13228 case 'M': 13229 case 'N': 13230 case 'G': 13231 case 'C': 13232 case 'e': 13233 case 'Z': 13234 return C_Other; 13235 default: 13236 break; 13237 } 13238 } 13239 return TargetLowering::getConstraintType(Constraint); 13240} 13241 13242/// Examine constraint type and operand type and determine a weight value. 13243/// This object must already have been set up with the operand type 13244/// and the current alternative constraint selected. 13245TargetLowering::ConstraintWeight 13246 X86TargetLowering::getSingleConstraintMatchWeight( 13247 AsmOperandInfo &info, const char *constraint) const { 13248 ConstraintWeight weight = CW_Invalid; 13249 Value *CallOperandVal = info.CallOperandVal; 13250 // If we don't have a value, we can't do a match, 13251 // but allow it at the lowest weight. 13252 if (CallOperandVal == NULL) 13253 return CW_Default; 13254 Type *type = CallOperandVal->getType(); 13255 // Look at the constraint type. 
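// Illustrative weighting under the switch below (letters and operand types
// are just examples): an MMX value against 'y' or an integer against 'a'
// scores CW_SpecificReg, a 128-bit vector against 'x' scores CW_Register,
// and an in-range immediate against 'I'..'N' scores CW_Constant.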
13256 switch (*constraint) { 13257 default: 13258 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 13259 case 'R': 13260 case 'q': 13261 case 'Q': 13262 case 'a': 13263 case 'b': 13264 case 'c': 13265 case 'd': 13266 case 'S': 13267 case 'D': 13268 case 'A': 13269 if (CallOperandVal->getType()->isIntegerTy()) 13270 weight = CW_SpecificReg; 13271 break; 13272 case 'f': 13273 case 't': 13274 case 'u': 13275 if (type->isFloatingPointTy()) 13276 weight = CW_SpecificReg; 13277 break; 13278 case 'y': 13279 if (type->isX86_MMXTy() && Subtarget->hasMMX()) 13280 weight = CW_SpecificReg; 13281 break; 13282 case 'x': 13283 case 'Y': 13284 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM()) 13285 weight = CW_Register; 13286 break; 13287 case 'I': 13288 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { 13289 if (C->getZExtValue() <= 31) 13290 weight = CW_Constant; 13291 } 13292 break; 13293 case 'J': 13294 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 13295 if (C->getZExtValue() <= 63) 13296 weight = CW_Constant; 13297 } 13298 break; 13299 case 'K': 13300 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 13301 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) 13302 weight = CW_Constant; 13303 } 13304 break; 13305 case 'L': 13306 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 13307 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) 13308 weight = CW_Constant; 13309 } 13310 break; 13311 case 'M': 13312 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 13313 if (C->getZExtValue() <= 3) 13314 weight = CW_Constant; 13315 } 13316 break; 13317 case 'N': 13318 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 13319 if (C->getZExtValue() <= 0xff) 13320 weight = CW_Constant; 13321 } 13322 break; 13323 case 'G': 13324 case 'C': 13325 if (dyn_cast<ConstantFP>(CallOperandVal)) { 13326 weight = CW_Constant; 13327 } 13328 break; 13329 case 'e': 13330 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 13331 if ((C->getSExtValue() >= -0x80000000LL) && 13332 (C->getSExtValue() <= 0x7fffffffLL)) 13333 weight = CW_Constant; 13334 } 13335 break; 13336 case 'Z': 13337 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 13338 if (C->getZExtValue() <= 0xffffffff) 13339 weight = CW_Constant; 13340 } 13341 break; 13342 } 13343 return weight; 13344} 13345 13346/// LowerXConstraint - try to replace an X constraint, which matches anything, 13347/// with another that has more specific requirements based on the type of the 13348/// corresponding operand. 13349const char *X86TargetLowering:: 13350LowerXConstraint(EVT ConstraintVT) const { 13351 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 13352 // 'f' like normal targets. 13353 if (ConstraintVT.isFloatingPoint()) { 13354 if (Subtarget->hasXMMInt()) 13355 return "Y"; 13356 if (Subtarget->hasXMM()) 13357 return "x"; 13358 } 13359 13360 return TargetLowering::LowerXConstraint(ConstraintVT); 13361} 13362 13363/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 13364/// vector. If it is invalid, don't add anything to Ops. 13365void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 13366 std::string &Constraint, 13367 std::vector<SDValue>&Ops, 13368 SelectionDAG &DAG) const { 13369 SDValue Result(0, 0); 13370 13371 // Only support length 1 constraints for now. 
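// Worked example (hedged; the values are made up): for constraint 'K' an
// operand of 100 fits in a signed 8-bit immediate and becomes a target
// constant, while 200 does not, so nothing is added to Ops and the operand
// is treated as invalid for that constraint.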
13372 if (Constraint.length() > 1) return; 13373 13374 char ConstraintLetter = Constraint[0]; 13375 switch (ConstraintLetter) { 13376 default: break; 13377 case 'I': 13378 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13379 if (C->getZExtValue() <= 31) { 13380 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13381 break; 13382 } 13383 } 13384 return; 13385 case 'J': 13386 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13387 if (C->getZExtValue() <= 63) { 13388 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13389 break; 13390 } 13391 } 13392 return; 13393 case 'K': 13394 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13395 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 13396 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13397 break; 13398 } 13399 } 13400 return; 13401 case 'N': 13402 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13403 if (C->getZExtValue() <= 255) { 13404 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13405 break; 13406 } 13407 } 13408 return; 13409 case 'e': { 13410 // 32-bit signed value 13411 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13412 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 13413 C->getSExtValue())) { 13414 // Widen to 64 bits here to get it sign extended. 13415 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 13416 break; 13417 } 13418 // FIXME gcc accepts some relocatable values here too, but only in certain 13419 // memory models; it's complicated. 13420 } 13421 return; 13422 } 13423 case 'Z': { 13424 // 32-bit unsigned value 13425 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13426 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 13427 C->getZExtValue())) { 13428 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13429 break; 13430 } 13431 } 13432 // FIXME gcc accepts some relocatable values here too, but only in certain 13433 // memory models; it's complicated. 13434 return; 13435 } 13436 case 'i': { 13437 // Literal immediates are always ok. 13438 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 13439 // Widen to 64 bits here to get it sign extended. 13440 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 13441 break; 13442 } 13443 13444 // In any sort of PIC mode addresses need to be computed at runtime by 13445 // adding in a register or some sort of table lookup. These can't 13446 // be used as immediates. 13447 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 13448 return; 13449 13450 // If we are in non-pic codegen mode, we allow the address of a global (with 13451 // an optional displacement) to be used with 'i'. 13452 GlobalAddressSDNode *GA = 0; 13453 int64_t Offset = 0; 13454 13455 // Match either (GA), (GA+C), (GA+C1+C2), etc. 13456 while (1) { 13457 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 13458 Offset += GA->getOffset(); 13459 break; 13460 } else if (Op.getOpcode() == ISD::ADD) { 13461 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 13462 Offset += C->getZExtValue(); 13463 Op = Op.getOperand(0); 13464 continue; 13465 } 13466 } else if (Op.getOpcode() == ISD::SUB) { 13467 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 13468 Offset += -C->getZExtValue(); 13469 Op = Op.getOperand(0); 13470 continue; 13471 } 13472 } 13473 13474 // Otherwise, this isn't something we can handle, reject it. 
13475 return; 13476 } 13477 13478 const GlobalValue *GV = GA->getGlobal(); 13479 // If we require an extra load to get this address, as in PIC mode, we 13480 // can't accept it. 13481 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 13482 getTargetMachine()))) 13483 return; 13484 13485 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 13486 GA->getValueType(0), Offset); 13487 break; 13488 } 13489 } 13490 13491 if (Result.getNode()) { 13492 Ops.push_back(Result); 13493 return; 13494 } 13495 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 13496} 13497 13498std::pair<unsigned, const TargetRegisterClass*> 13499X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 13500 EVT VT) const { 13501 // First, see if this is a constraint that directly corresponds to an LLVM 13502 // register class. 13503 if (Constraint.size() == 1) { 13504 // GCC Constraint Letters 13505 switch (Constraint[0]) { 13506 default: break; 13507 // TODO: Slight differences here in allocation order and leaving 13508 // RIP in the class. Do they matter any more here than they do 13509 // in the normal allocation? 13510 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 13511 if (Subtarget->is64Bit()) { 13512 if (VT == MVT::i32 || VT == MVT::f32) 13513 return std::make_pair(0U, X86::GR32RegisterClass); 13514 else if (VT == MVT::i16) 13515 return std::make_pair(0U, X86::GR16RegisterClass); 13516 else if (VT == MVT::i8 || VT == MVT::i1) 13517 return std::make_pair(0U, X86::GR8RegisterClass); 13518 else if (VT == MVT::i64 || VT == MVT::f64) 13519 return std::make_pair(0U, X86::GR64RegisterClass); 13520 break; 13521 } 13522 // 32-bit fallthrough 13523 case 'Q': // Q_REGS 13524 if (VT == MVT::i32 || VT == MVT::f32) 13525 return std::make_pair(0U, X86::GR32_ABCDRegisterClass); 13526 else if (VT == MVT::i16) 13527 return std::make_pair(0U, X86::GR16_ABCDRegisterClass); 13528 else if (VT == MVT::i8 || VT == MVT::i1) 13529 return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass); 13530 else if (VT == MVT::i64) 13531 return std::make_pair(0U, X86::GR64_ABCDRegisterClass); 13532 break; 13533 case 'r': // GENERAL_REGS 13534 case 'l': // INDEX_REGS 13535 if (VT == MVT::i8 || VT == MVT::i1) 13536 return std::make_pair(0U, X86::GR8RegisterClass); 13537 if (VT == MVT::i16) 13538 return std::make_pair(0U, X86::GR16RegisterClass); 13539 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) 13540 return std::make_pair(0U, X86::GR32RegisterClass); 13541 return std::make_pair(0U, X86::GR64RegisterClass); 13542 case 'R': // LEGACY_REGS 13543 if (VT == MVT::i8 || VT == MVT::i1) 13544 return std::make_pair(0U, X86::GR8_NOREXRegisterClass); 13545 if (VT == MVT::i16) 13546 return std::make_pair(0U, X86::GR16_NOREXRegisterClass); 13547 if (VT == MVT::i32 || !Subtarget->is64Bit()) 13548 return std::make_pair(0U, X86::GR32_NOREXRegisterClass); 13549 return std::make_pair(0U, X86::GR64_NOREXRegisterClass); 13550 case 'f': // FP Stack registers. 13551 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 13552 // value to the correct fpstack register class. 13553 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 13554 return std::make_pair(0U, X86::RFP32RegisterClass); 13555 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 13556 return std::make_pair(0U, X86::RFP64RegisterClass); 13557 return std::make_pair(0U, X86::RFP80RegisterClass); 13558 case 'y': // MMX_REGS if MMX allowed. 
13559 if (!Subtarget->hasMMX()) break; 13560 return std::make_pair(0U, X86::VR64RegisterClass); 13561 case 'Y': // SSE_REGS if SSE2 allowed 13562 if (!Subtarget->hasXMMInt()) break; 13563 // FALL THROUGH. 13564 case 'x': // SSE_REGS if SSE1 allowed 13565 if (!Subtarget->hasXMM()) break; 13566 13567 switch (VT.getSimpleVT().SimpleTy) { 13568 default: break; 13569 // Scalar SSE types. 13570 case MVT::f32: 13571 case MVT::i32: 13572 return std::make_pair(0U, X86::FR32RegisterClass); 13573 case MVT::f64: 13574 case MVT::i64: 13575 return std::make_pair(0U, X86::FR64RegisterClass); 13576 // Vector types. 13577 case MVT::v16i8: 13578 case MVT::v8i16: 13579 case MVT::v4i32: 13580 case MVT::v2i64: 13581 case MVT::v4f32: 13582 case MVT::v2f64: 13583 return std::make_pair(0U, X86::VR128RegisterClass); 13584 } 13585 break; 13586 } 13587 } 13588 13589 // Use the default implementation in TargetLowering to convert the register 13590 // constraint into a member of a register class. 13591 std::pair<unsigned, const TargetRegisterClass*> Res; 13592 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 13593 13594 // Not found as a standard register? 13595 if (Res.second == 0) { 13596 // Map st(0) -> st(7) -> ST0 13597 if (Constraint.size() == 7 && Constraint[0] == '{' && 13598 tolower(Constraint[1]) == 's' && 13599 tolower(Constraint[2]) == 't' && 13600 Constraint[3] == '(' && 13601 (Constraint[4] >= '0' && Constraint[4] <= '7') && 13602 Constraint[5] == ')' && 13603 Constraint[6] == '}') { 13604 13605 Res.first = X86::ST0+Constraint[4]-'0'; 13606 Res.second = X86::RFP80RegisterClass; 13607 return Res; 13608 } 13609 13610 // GCC allows "st(0)" to be called just plain "st". 13611 if (StringRef("{st}").equals_lower(Constraint)) { 13612 Res.first = X86::ST0; 13613 Res.second = X86::RFP80RegisterClass; 13614 return Res; 13615 } 13616 13617 // flags -> EFLAGS 13618 if (StringRef("{flags}").equals_lower(Constraint)) { 13619 Res.first = X86::EFLAGS; 13620 Res.second = X86::CCRRegisterClass; 13621 return Res; 13622 } 13623 13624 // 'A' means EAX + EDX. 13625 if (Constraint == "A") { 13626 Res.first = X86::EAX; 13627 Res.second = X86::GR32_ADRegisterClass; 13628 return Res; 13629 } 13630 return Res; 13631 } 13632 13633 // Otherwise, check to see if this is a register class of the wrong value 13634 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to 13635 // turn into {ax},{dx}. 13636 if (Res.second->hasType(VT)) 13637 return Res; // Correct type already, nothing to do. 13638 13639 // All of the single-register GCC register classes map their values onto 13640 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we 13641 // really want an 8-bit or 32-bit register, map to the appropriate register 13642 // class and return the appropriate register. 
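// Example of the remapping below (register names are the usual x86 aliases):
// the constraint "{ax}" with an i32 operand first resolves to AX in GR16;
// the code that follows rewrites it to EAX in GR32, to RAX for an i64
// operand, or to AL for an i8 operand.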
13643 if (Res.second == X86::GR16RegisterClass) { 13644 if (VT == MVT::i8) { 13645 unsigned DestReg = 0; 13646 switch (Res.first) { 13647 default: break; 13648 case X86::AX: DestReg = X86::AL; break; 13649 case X86::DX: DestReg = X86::DL; break; 13650 case X86::CX: DestReg = X86::CL; break; 13651 case X86::BX: DestReg = X86::BL; break; 13652 } 13653 if (DestReg) { 13654 Res.first = DestReg; 13655 Res.second = X86::GR8RegisterClass; 13656 } 13657 } else if (VT == MVT::i32) { 13658 unsigned DestReg = 0; 13659 switch (Res.first) { 13660 default: break; 13661 case X86::AX: DestReg = X86::EAX; break; 13662 case X86::DX: DestReg = X86::EDX; break; 13663 case X86::CX: DestReg = X86::ECX; break; 13664 case X86::BX: DestReg = X86::EBX; break; 13665 case X86::SI: DestReg = X86::ESI; break; 13666 case X86::DI: DestReg = X86::EDI; break; 13667 case X86::BP: DestReg = X86::EBP; break; 13668 case X86::SP: DestReg = X86::ESP; break; 13669 } 13670 if (DestReg) { 13671 Res.first = DestReg; 13672 Res.second = X86::GR32RegisterClass; 13673 } 13674 } else if (VT == MVT::i64) { 13675 unsigned DestReg = 0; 13676 switch (Res.first) { 13677 default: break; 13678 case X86::AX: DestReg = X86::RAX; break; 13679 case X86::DX: DestReg = X86::RDX; break; 13680 case X86::CX: DestReg = X86::RCX; break; 13681 case X86::BX: DestReg = X86::RBX; break; 13682 case X86::SI: DestReg = X86::RSI; break; 13683 case X86::DI: DestReg = X86::RDI; break; 13684 case X86::BP: DestReg = X86::RBP; break; 13685 case X86::SP: DestReg = X86::RSP; break; 13686 } 13687 if (DestReg) { 13688 Res.first = DestReg; 13689 Res.second = X86::GR64RegisterClass; 13690 } 13691 } 13692 } else if (Res.second == X86::FR32RegisterClass || 13693 Res.second == X86::FR64RegisterClass || 13694 Res.second == X86::VR128RegisterClass) { 13695 // Handle references to XMM physical registers that got mapped into the 13696 // wrong class. This can happen with constraints like {xmm0} where the 13697 // target independent register mapper will just pick the first match it can 13698 // find, ignoring the required type. 13699 if (VT == MVT::f32) 13700 Res.second = X86::FR32RegisterClass; 13701 else if (VT == MVT::f64) 13702 Res.second = X86::FR64RegisterClass; 13703 else if (X86::VR128RegisterClass->hasType(VT)) 13704 Res.second = X86::VR128RegisterClass; 13705 } 13706 13707 return Res; 13708} 13709