X86ISelLowering.cpp revision 2e6496026f41d2c05ff038d14df9972f8a27fb94
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "Utils/X86ShuffleDecode.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static SDValue Insert128BitVector(SDValue Result,
                                  SDValue Vec,
                                  SDValue Idx,
                                  SelectionDAG &DAG,
                                  DebugLoc dl);

static SDValue Extract128BitVector(SDValue Vec,
                                   SDValue Idx,
                                   SelectionDAG &DAG,
                                   DebugLoc dl);

static SDValue ConcatVectors(SDValue Lower, SDValue Upper, SelectionDAG &DAG);


/// Generate a DAG to grab 128 bits from a vector > 128 bits.  This
/// sets things up to match to an AVX VEXTRACTF128 instruction or a
/// simple subregister reference.  Idx is an index in the 128 bits we
/// want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec,
                                   SDValue Idx,
                                   SelectionDAG &DAG,
                                   DebugLoc dl) {
  EVT VT = Vec.getValueType();
  assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");

  EVT ElVT = VT.getVectorElementType();

  int Factor = VT.getSizeInBits() / 128;

  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(),
                                  ElVT,
                                  VT.getVectorNumElements() / Factor);

  // Extract from UNDEF is UNDEF.
  if (Vec.getOpcode() == ISD::UNDEF)
    return DAG.getNode(ISD::UNDEF, dl, ResultVT);

  if (isa<ConstantSDNode>(Idx)) {
    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

    // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
    // that we can match to VEXTRACTF128.
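    // Worked example: for a v8i32 source (i32 elements, so ElemsPerChunk
    // below is 128/32 = 4), IdxVal = 5 normalizes to ((5*32)/128)*4 = 4,
    // so the EXTRACT_SUBVECTOR covers elements 4-7, the upper 128-bit half.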
    unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();

    // This is the index of the first element of the 128-bit chunk
    // we want.
    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
                                 * ElemsPerChunk);

    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);

    SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                                 VecIdx);

    return Result;
  }

  return SDValue();
}

/// Generate a DAG to put 128 bits into a vector > 128 bits.  This
/// sets things up to match to an AVX VINSERTF128 instruction or a
/// simple superregister reference.  Idx is an index in the 128 bits
/// we want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result,
                                  SDValue Vec,
                                  SDValue Idx,
                                  SelectionDAG &DAG,
                                  DebugLoc dl) {
  if (isa<ConstantSDNode>(Idx)) {
    EVT VT = Vec.getValueType();
    assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");

    EVT ElVT = VT.getVectorElementType();

    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

    EVT ResultVT = Result.getValueType();

    // Insert the relevant 128 bits.
    unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();

    // This is the index of the first element of the 128-bit chunk
    // we want.
    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
                                 * ElemsPerChunk);

    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);

    Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                         VecIdx);
    return Result;
  }

  return SDValue();
}

/// Given two vectors, concat them.
static SDValue ConcatVectors(SDValue Lower, SDValue Upper, SelectionDAG &DAG) {
  DebugLoc dl = Lower.getDebugLoc();

  assert(Lower.getValueType() == Upper.getValueType() && "Mismatched vectors!");

  EVT VT = EVT::getVectorVT(*DAG.getContext(),
                            Lower.getValueType().getVectorElementType(),
                            Lower.getValueType().getVectorNumElements() * 2);

  // TODO: Generalize to arbitrary vector length (this assumes 256-bit vectors).
  assert(VT.getSizeInBits() == 256 && "Unsupported vector concat!");

  // Insert the upper subvector.
  SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Upper,
                                   DAG.getConstant(
                                     // This is half the length of the result
                                     // vector.  Start inserting the upper 128
                                     // bits here.
                                     Lower.getValueType().getVectorNumElements(),
                                     MVT::i32),
                                   DAG, dl);

  // Insert the lower subvector.
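  // The upper half already sits at an element offset equal to Lower's
  // element count (set above), so inserting Lower at index 0 fills the low
  // 128 bits and completes the 256-bit concatenation.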
  Vec = Insert128BitVector(Vec, Lower, DAG.getConstant(0, MVT::i32), DAG, dl);
  return Vec;
}

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
  bool is64Bit = Subtarget->is64Bit();

  if (Subtarget->isTargetEnvMacho()) {
    if (is64Bit)
      return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  }

  if (Subtarget->isTargetELF()) {
    if (is64Bit)
      return new X8664_ELFTargetObjectFile(TM);
    return new X8632_ELFTargetObjectFile(TM);
  }
  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    return new TargetLoweringObjectFileCOFF();
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasXMMInt();
  X86ScalarSSEf32 = Subtarget->hasXMM();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.
  static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird: it always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);

  // For 64-bit, use the ILP scheduler since we have so many registers; for
  // 32-bit code, use the register-pressure-specific scheduling.
  if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    // Set up Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2");
    setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C);
    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C);
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
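  // With Expand, e.g. an i32-to-i16 truncating store is legalized into an
  // explicit TRUNCATE followed by an ordinary i16 store.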
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FPs, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
  } else if (!UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FPs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
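  // For instance, an f64 -> i16 FP_TO_UINT becomes an f64 -> i32 signed
  // conversion whose low 16 bits are then used; every u16 value fits in the
  // non-negative i32 range, so the result is exact.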
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions.  This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions.  However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86.  Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (unsigned i = 0, e = 4; i != e; ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
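    // e.g. an i64 add on 32-bit x86 is split into an ADDC of the low halves
    // producing a carry, plus an ADDE of the high halves consuming it, which
    // select to ADD followed by ADC reading EFLAGS.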
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  setOperationAction(ISD::CTTZ, MVT::i8, Custom);
  setOperationAction(ISD::CTLZ, MVT::i8, Custom);
  setOperationAction(ISD::CTTZ, MVT::i16, Custom);
  setOperationAction(ISD::CTLZ, MVT::i16, Custom);
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTLZ, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTTZ, MVT::i64, Custom);
    setOperationAction(ISD::CTLZ, MVT::i64, Custom);
  }

  if (Subtarget->hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasXMM())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  // We may not have a libcall for MEMBARRIER so we should lower this.
  setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
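  // e.g. "fence; lock cmpxchg; fence" collapses to just the lock cmpxchg,
  // since the locked instruction already provides the ordering.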
  setShouldFoldAtomicFences(true);

  // Expand certain atomics
  for (unsigned i = 0, e = 4; i != e; ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
  }

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC,
                     (Subtarget->is64Bit() ? MVT::i64 : MVT::i32),
                     (Subtarget->isTargetCOFF()
                      && !Subtarget->isTargetEnvMacho()
                      ? Custom : Expand));

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
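    // +0.0 is special because it can be materialized by zeroing a register
    // with a xor, avoiding a constant-pool load.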
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
    }
  }

  // Always use a library call for pow.
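  // Expand here means the legalizer emits a call to pow/powf/powl from the
  // runtime library.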
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization).  Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction((MVT::SimpleValueType)VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && Subtarget->hasMMX()) {
    addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  // MMX-sized vectors (other than x86mmx) are expected to be expanded
  // into smaller operations.
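  // e.g. an AND of two v2i32 values gets widened or scalarized by the
  // legalizer rather than being selected to an MMX pand.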
  setOperationAction(ISD::MULHS, MVT::v8i8, Expand);
  setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
  setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
  setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
  setOperationAction(ISD::AND, MVT::v8i8, Expand);
  setOperationAction(ISD::AND, MVT::v4i16, Expand);
  setOperationAction(ISD::AND, MVT::v2i32, Expand);
  setOperationAction(ISD::AND, MVT::v1i64, Expand);
  setOperationAction(ISD::OR, MVT::v8i8, Expand);
  setOperationAction(ISD::OR, MVT::v4i16, Expand);
  setOperationAction(ISD::OR, MVT::v2i32, Expand);
  setOperationAction(ISD::OR, MVT::v1i64, Expand);
  setOperationAction(ISD::XOR, MVT::v8i8, Expand);
  setOperationAction(ISD::XOR, MVT::v4i16, Expand);
  setOperationAction(ISD::XOR, MVT::v2i32, Expand);
  setOperationAction(ISD::XOR, MVT::v1i64, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
  setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
  setOperationAction(ISD::BITCAST, MVT::v8i8, Expand);
  setOperationAction(ISD::BITCAST, MVT::v4i16, Expand);
  setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
  setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);

  if (!UseSoftFloat && Subtarget->hasXMM()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasXMMInt()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
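    // e.g. a v4i32 AND is bitcast to v2i64, performed there, and bitcast
    // back; the same 128-bit pand works for every element width.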
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
  }

  if (Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // Can turn SHL into an integer multiply.
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width.  f32 vectors are custom
    // since the immediate controlling the insert encodes additional
    // information.
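    // For v4f32, the SSE4.1 insertps immediate packs the source lane, the
    // destination lane, and a zeroing mask into one byte, so the custom
    // lowering has to build that immediate itself.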
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE2()) {
    setOperationAction(ISD::SRL, MVT::v2i64, Custom);
    setOperationAction(ISD::SRL, MVT::v4i32, Custom);
    setOperationAction(ISD::SRL, MVT::v16i8, Custom);

    setOperationAction(ISD::SHL, MVT::v2i64, Custom);
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);
    setOperationAction(ISD::SHL, MVT::v8i16, Custom);

    setOperationAction(ISD::SRA, MVT::v4i32, Custom);
    setOperationAction(ISD::SRA, MVT::v8i16, Custom);
  }

  if (Subtarget->hasSSE42())
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);

  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);

    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);

    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);

    // Custom lower build_vector, vector_shuffle, scalar_to_vector,
    // insert_vector_elt, extract_subvector and extract_vector_elt for
    // 256-bit types.
    for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE;
         ++i) {
      MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-256-bit vectors
      if (!isPowerOf2_32(MVT(VT).getVectorNumElements())
          || (MVT(VT).getSizeInBits() < 256))
        continue;
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
    }
    // Custom-lower insert_subvector and extract_subvector based on
    // the result type.
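    // A 128-bit result type corresponds to extracting half of a 256-bit
    // vector; a 256-bit result type corresponds to inserting into one.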
    for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE;
         ++i) {
      MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-256-bit vectors
      if (!isPowerOf2_32(MVT(VT).getVectorNumElements()))
        continue;

      if (MVT(VT).getSizeInBits() == 128) {
        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
      }
      else if (MVT(VT).getSizeInBits() == 256) {
        setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      }
    }

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    // Don't promote loads because we need them for VPERM vector index versions.

    for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE;
         VT++) {
      if (!isPowerOf2_32(MVT((MVT::SimpleValueType)VT).getVectorNumElements())
          || (MVT((MVT::SimpleValueType)VT).getSizeInBits() < 256))
        continue;
      setOperationAction(ISD::AND, (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::AND, (MVT::SimpleValueType)VT, MVT::v4i64);
      setOperationAction(ISD::OR, (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::OR, (MVT::SimpleValueType)VT, MVT::v4i64);
      setOperationAction(ISD::XOR, (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::XOR, (MVT::SimpleValueType)VT, MVT::v4i64);
      //setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Promote);
      //AddPromotedToType (ISD::LOAD, (MVT::SimpleValueType)VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::SELECT, (MVT::SimpleValueType)VT, MVT::v4i64);
    }
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);


  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
    // Add/Sub/Mul with overflow operations are custom lowered.
    MVT VT = IntVTs[i];
    setOperationAction(ISD::SADDO, VT, Custom);
    setOperationAction(ISD::UADDO, VT, Custom);
    setOperationAction(ISD::SSUBO, VT, Custom);
    setOperationAction(ISD::USUBO, VT, Custom);
    setOperationAction(ISD::SMULO, VT, Custom);
    setOperationAction(ISD::UMULO, VT, Custom);
  }

  // There are no 8-bit 3-address imul/mul instructions
  setOperationAction(ISD::SMULO, MVT::i8, Expand);
  setOperationAction(ISD::UMULO, MVT::i8, Expand);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
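    // Clearing the libcall name keeps the legalizer from emitting a runtime
    // call, so 128-bit shifts get expanded inline instead.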
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // On Darwin, -Os means optimize for size without hurting performance,
  // so do not reduce the limit.
  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;

  setPrefFunctionAlignment(4);
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.  For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasXMM())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target-specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering.  If DstAlign is zero, that means the destination
/// alignment can satisfy any constraint.
/// Similarly, if SrcAlign is zero, it
/// means there isn't a need to check it against the alignment requirement,
/// probably because the source does not need to be loaded.  If
/// 'NonScalarIntSafe' is true, that means it's safe to return a
/// non-scalar-integer type, e.g. empty string source, constant, or loaded
/// from memory.  'MemcpyStrSrc' indicates whether the memcpy source is
/// constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux.  This is because the stack realignment code can't handle certain
  // cases like PR2962.  This should be removed when PR2962 is fixed.
  const Function *F = MF.getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasXMMInt()) {
      // Do not use f64 to lower memcpy if source is string constant.  It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function.  The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have a DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP-relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
}

// FIXME: Why is this routine here? Move to RegInfo!
std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(EVT VT) const {
  const TargetRegisterClass *RRC = 0;
  uint8_t Cost = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = (Subtarget->is64Bit()
           ? X86::GR64RegisterClass : X86::GR32RegisterClass);
    break;
  case MVT::x86mmx:
    RRC = X86::VR64RegisterClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
  case MVT::v4f64:
    RRC = X86::VR128RegisterClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
                                               unsigned &Offset) const {
  if (!Subtarget->isTargetLinux())
    return false;

  if (Subtarget->is64Bit()) {
    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
    Offset = 0x28;
    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
      AddressSpace = 256;
    else
      AddressSpace = 257;
  } else {
    // %gs:0x14 on i386
    Offset = 0x14;
    AddressSpace = 256;
  }
  return true;
}


//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               DebugLoc dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // Add the regs to the liveout set for the function.
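  // Marking each return register as live-out keeps later passes from
  // deleting the copies into those registers as dead code.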
1342 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1343 for (unsigned i = 0; i != RVLocs.size(); ++i) 1344 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1345 MRI.addLiveOut(RVLocs[i].getLocReg()); 1346 1347 SDValue Flag; 1348 1349 SmallVector<SDValue, 6> RetOps; 1350 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1351 // Operand #1 = Bytes To Pop 1352 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1353 MVT::i16)); 1354 1355 // Copy the result values into the output registers. 1356 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1357 CCValAssign &VA = RVLocs[i]; 1358 assert(VA.isRegLoc() && "Can only return in registers!"); 1359 SDValue ValToCopy = OutVals[i]; 1360 EVT ValVT = ValToCopy.getValueType(); 1361 1362 // If this is x86-64, and we disabled SSE, we can't return FP values, 1363 // or SSE or MMX vectors. 1364 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 1365 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 1366 (Subtarget->is64Bit() && !Subtarget->hasXMM())) { 1367 report_fatal_error("SSE register return with SSE disabled"); 1368 } 1369 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 1370 // llvm-gcc has never done it right and no one has noticed, so this 1371 // should be OK for now. 1372 if (ValVT == MVT::f64 && 1373 (Subtarget->is64Bit() && !Subtarget->hasXMMInt())) 1374 report_fatal_error("SSE2 register return with SSE2 disabled"); 1375 1376 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1377 // the RET instruction and handled by the FP Stackifier. 1378 if (VA.getLocReg() == X86::ST0 || 1379 VA.getLocReg() == X86::ST1) { 1380 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1381 // change the value to the FP stack register class. 1382 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1383 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1384 RetOps.push_back(ValToCopy); 1385 // Don't emit a copytoreg. 1386 continue; 1387 } 1388 1389 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1390 // which is returned in RAX / RDX. 1391 if (Subtarget->is64Bit()) { 1392 if (ValVT == MVT::x86mmx) { 1393 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1394 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 1395 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1396 ValToCopy); 1397 // If we don't have SSE2 available, convert to v4f32 so the generated 1398 // register is legal. 1399 if (!Subtarget->hasSSE2()) 1400 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 1401 } 1402 } 1403 } 1404 1405 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1406 Flag = Chain.getValue(1); 1407 } 1408 1409 // The x86-64 ABI for returning structs by value requires that we copy 1410 // the sret argument into %rax for the return. We saved the argument into 1411 // a virtual register in the entry block, so now we copy the value out 1412 // and into %rax. 
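  // For example, for the (hypothetical) IR:
  //   define void @f(%struct.S* sret %out) ...
  // the %out pointer must be left in %rax on return, even though the
  // IR-level return type is void.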
1413   if (Subtarget->is64Bit() &&
1414       DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
1415     MachineFunction &MF = DAG.getMachineFunction();
1416     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1417     unsigned Reg = FuncInfo->getSRetReturnReg();
1418     assert(Reg &&
1419            "SRetReturnReg should have been set in LowerFormalArguments().");
1420     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1421
1422     Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
1423     Flag = Chain.getValue(1);
1424
1425     // RAX now acts like a return value.
1426     MRI.addLiveOut(X86::RAX);
1427   }
1428
1429   RetOps[0] = Chain;  // Update chain.
1430
1431   // Add the flag if we have it.
1432   if (Flag.getNode())
1433     RetOps.push_back(Flag);
1434
1435   return DAG.getNode(X86ISD::RET_FLAG, dl,
1436                      MVT::Other, &RetOps[0], RetOps.size());
1437 }
1438
1439 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const {
1440   if (N->getNumValues() != 1)
1441     return false;
1442   if (!N->hasNUsesOfValue(1, 0))
1443     return false;
1444
1445   SDNode *Copy = *N->use_begin();
1446   if (Copy->getOpcode() != ISD::CopyToReg &&
1447       Copy->getOpcode() != ISD::FP_EXTEND)
1448     return false;
1449
1450   bool HasRet = false;
1451   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
1452        UI != UE; ++UI) {
1453     if (UI->getOpcode() != X86ISD::RET_FLAG)
1454       return false;
1455     HasRet = true;
1456   }
1457
1458   return HasRet;
1459 }
1460
1461 EVT
1462 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
1463                                             ISD::NodeType ExtendKind) const {
1464   MVT ReturnMVT;
1465   // TODO: Is this also valid on 32-bit?
1466   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
1467     ReturnMVT = MVT::i8;
1468   else
1469     ReturnMVT = MVT::i32;
1470
1471   EVT MinVT = getRegisterType(Context, ReturnMVT);
1472   return VT.bitsLT(MinVT) ? MinVT : VT;
1473 }
1474
1475 /// LowerCallResult - Lower the result values of a call into the
1476 /// appropriate copies out of appropriate physical registers.
1477 ///
1478 SDValue
1479 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1480                                    CallingConv::ID CallConv, bool isVarArg,
1481                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1482                                    DebugLoc dl, SelectionDAG &DAG,
1483                                    SmallVectorImpl<SDValue> &InVals) const {
1484
1485   // Assign locations to each value returned by this call.
1486   SmallVector<CCValAssign, 16> RVLocs;
1487   bool Is64Bit = Subtarget->is64Bit();
1488   CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1489                  RVLocs, *DAG.getContext());
1490   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1491
1492   // Copy all of the result registers out of their specified physreg.
1493   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1494     CCValAssign &VA = RVLocs[i];
1495     EVT CopyVT = VA.getValVT();
1496
1497     // If this is x86-64, and we disabled SSE, we can't return FP values
1498     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1499         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) {
1500       report_fatal_error("SSE register return with SSE disabled");
1501     }
1502
1503     SDValue Val;
1504
1505     // If this is a call to a function that returns an fp value on the floating
1506     // point stack, we must guarantee that the value is popped from the stack,
1507     // so a CopyFromReg is not good enough - the copy instruction may be
1508     // eliminated if the return value is not used. We use the FpGET_ST0
1509     // instructions instead.
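    // For example, a call returning double with SSE2 enabled is read back
    // from ST(0) as f80 via FpGET_ST0_80 and then rounded into an XMM
    // register by the FP_ROUND below. (Worked example; the opcode selection
    // is in the code that follows.)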
1510     if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
1511       // If we prefer to use the value in xmm registers, copy it out as f80 and
1512       // use a truncate to move it from fp stack reg to xmm reg.
1513       if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
1514       bool isST0 = VA.getLocReg() == X86::ST0;
1515       unsigned Opc = 0;
1516       if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32 : X86::FpGET_ST1_32;
1517       if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64 : X86::FpGET_ST1_64;
1518       if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80 : X86::FpGET_ST1_80;
1519       SDValue Ops[] = { Chain, InFlag };
1520       Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Glue,
1521                                          Ops, 2), 1);
1522       Val = Chain.getValue(0);
1523
1524       // Round the f80 to the right size, which also moves it to the appropriate
1525       // xmm register.
1526       if (CopyVT != VA.getValVT())
1527         Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1528                           // This truncation won't change the value.
1529                           DAG.getIntPtrConstant(1));
1530     } else {
1531       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1532                                  CopyVT, InFlag).getValue(1);
1533       Val = Chain.getValue(0);
1534     }
1535     InFlag = Chain.getValue(2);
1536     InVals.push_back(Val);
1537   }
1538
1539   return Chain;
1540 }
1541
1542
1543 //===----------------------------------------------------------------------===//
1544 //                C & StdCall & Fast Calling Convention implementation
1545 //===----------------------------------------------------------------------===//
1546 //  The StdCall calling convention is standard for many Windows API routines.
1547 //  It differs from the C calling convention only a little: the callee, not
1548 //  the caller, cleans up the stack, and symbols are decorated in a
1549 //  particular way. It does not support any vector arguments.
1550 //  For information on the fast calling convention see the Fast Calling
1551 //  Convention (tail call) implementation in LowerX86_32FastCCCallTo.
1552
1553 /// CallIsStructReturn - Determines whether a call uses struct return
1554 /// semantics.
1555 static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1556   if (Outs.empty())
1557     return false;
1558
1559   return Outs[0].Flags.isSRet();
1560 }
1561
1562 /// ArgsAreStructReturn - Determines whether a function uses struct
1563 /// return semantics.
1564 static bool
1565 ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1566   if (Ins.empty())
1567     return false;
1568
1569   return Ins[0].Flags.isSRet();
1570 }
1571
1572 /// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
1573 /// specified by "Src" to the address "Dst" with size and alignment information
1574 /// specified by the specific parameter attribute. The copy will be passed as
1575 /// a byval function parameter.
1576 static SDValue
1577 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1578                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1579                           DebugLoc dl) {
1580   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1581
1582   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1583                        /*isVolatile*/false, /*AlwaysInline=*/true,
1584                        MachinePointerInfo(), MachinePointerInfo());
1585 }
1586
1587 /// IsTailCallConvention - Return true if the calling convention is one that
1588 /// supports tail call optimization.
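/// (At present that means CallingConv::Fast and CallingConv::GHC.)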
1589 static bool IsTailCallConvention(CallingConv::ID CC) {
1590   return (CC == CallingConv::Fast || CC == CallingConv::GHC);
1591 }
1592
1593 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
1594   if (!CI->isTailCall())
1595     return false;
1596
1597   CallSite CS(CI);
1598   CallingConv::ID CalleeCC = CS.getCallingConv();
1599   if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
1600     return false;
1601
1602   return true;
1603 }
1604
1605 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
1606 /// a tailcall target by changing its ABI.
1607 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
1608   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
1609 }
1610
1611 SDValue
1612 X86TargetLowering::LowerMemArgument(SDValue Chain,
1613                                     CallingConv::ID CallConv,
1614                                     const SmallVectorImpl<ISD::InputArg> &Ins,
1615                                     DebugLoc dl, SelectionDAG &DAG,
1616                                     const CCValAssign &VA,
1617                                     MachineFrameInfo *MFI,
1618                                     unsigned i) const {
1619   // Create the nodes corresponding to a load from this parameter slot.
1620   ISD::ArgFlagsTy Flags = Ins[i].Flags;
1621   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv);
1622   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1623   EVT ValVT;
1624
1625   // If the value is passed by pointer, we have the address passed instead of
1626   // the value itself.
1627   if (VA.getLocInfo() == CCValAssign::Indirect)
1628     ValVT = VA.getLocVT();
1629   else
1630     ValVT = VA.getValVT();
1631
1632   // FIXME: For now, all byval parameter objects are marked mutable. This can
1633   // be changed with more analysis. In the case of tail call optimization,
1634   // mark all arguments mutable, since they could be overwritten by the
1635   // lowering of the arguments in case of a tail call.
1636   if (Flags.isByVal()) {
1637     unsigned Bytes = Flags.getByValSize();
1638     if (Bytes == 0) Bytes = 1;  // Don't create zero-sized stack objects.
1639     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
1640     return DAG.getFrameIndex(FI, getPointerTy());
1641   } else {
1642     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
1643                                     VA.getLocMemOffset(), isImmutable);
1644     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1645     return DAG.getLoad(ValVT, dl, Chain, FIN,
1646                        MachinePointerInfo::getFixedStack(FI),
1647                        false, false, 0);
1648   }
1649 }
1650
1651 SDValue
1652 X86TargetLowering::LowerFormalArguments(SDValue Chain,
1653                                         CallingConv::ID CallConv,
1654                                         bool isVarArg,
1655                                         const SmallVectorImpl<ISD::InputArg> &Ins,
1656                                         DebugLoc dl,
1657                                         SelectionDAG &DAG,
1658                                         SmallVectorImpl<SDValue> &InVals)
1659                                         const {
1660   MachineFunction &MF = DAG.getMachineFunction();
1661   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1662
1663   const Function* Fn = MF.getFunction();
1664   if (Fn->hasExternalLinkage() &&
1665       Subtarget->isTargetCygMing() &&
1666       Fn->getName() == "main")
1667     FuncInfo->setForceFramePointer(true);
1668
1669   MachineFrameInfo *MFI = MF.getFrameInfo();
1670   bool Is64Bit = Subtarget->is64Bit();
1671   bool IsWin64 = Subtarget->isTargetWin64();
1672
1673   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1674          "Var args not supported with calling convention fastcc or ghc");
1675
1676   // Assign locations to all of the incoming arguments.
1677 SmallVector<CCValAssign, 16> ArgLocs; 1678 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1679 ArgLocs, *DAG.getContext()); 1680 1681 // Allocate shadow area for Win64 1682 if (IsWin64) { 1683 CCInfo.AllocateStack(32, 8); 1684 } 1685 1686 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 1687 1688 unsigned LastVal = ~0U; 1689 SDValue ArgValue; 1690 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1691 CCValAssign &VA = ArgLocs[i]; 1692 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1693 // places. 1694 assert(VA.getValNo() != LastVal && 1695 "Don't support value assigned to multiple locs yet"); 1696 LastVal = VA.getValNo(); 1697 1698 if (VA.isRegLoc()) { 1699 EVT RegVT = VA.getLocVT(); 1700 TargetRegisterClass *RC = NULL; 1701 if (RegVT == MVT::i32) 1702 RC = X86::GR32RegisterClass; 1703 else if (Is64Bit && RegVT == MVT::i64) 1704 RC = X86::GR64RegisterClass; 1705 else if (RegVT == MVT::f32) 1706 RC = X86::FR32RegisterClass; 1707 else if (RegVT == MVT::f64) 1708 RC = X86::FR64RegisterClass; 1709 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1710 RC = X86::VR256RegisterClass; 1711 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1712 RC = X86::VR128RegisterClass; 1713 else if (RegVT == MVT::x86mmx) 1714 RC = X86::VR64RegisterClass; 1715 else 1716 llvm_unreachable("Unknown argument type!"); 1717 1718 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1719 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1720 1721 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1722 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1723 // right size. 1724 if (VA.getLocInfo() == CCValAssign::SExt) 1725 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1726 DAG.getValueType(VA.getValVT())); 1727 else if (VA.getLocInfo() == CCValAssign::ZExt) 1728 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1729 DAG.getValueType(VA.getValVT())); 1730 else if (VA.getLocInfo() == CCValAssign::BCvt) 1731 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 1732 1733 if (VA.isExtInLoc()) { 1734 // Handle MMX values passed in XMM regs. 1735 if (RegVT.isVector()) { 1736 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), 1737 ArgValue); 1738 } else 1739 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1740 } 1741 } else { 1742 assert(VA.isMemLoc()); 1743 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1744 } 1745 1746 // If value is passed via pointer - do a load. 1747 if (VA.getLocInfo() == CCValAssign::Indirect) 1748 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 1749 MachinePointerInfo(), false, false, 0); 1750 1751 InVals.push_back(ArgValue); 1752 } 1753 1754 // The x86-64 ABI for returning structs by value requires that we copy 1755 // the sret argument into %rax for the return. Save the argument into 1756 // a virtual register so that we can access it from the return points. 
1757   if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
1758     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1759     unsigned Reg = FuncInfo->getSRetReturnReg();
1760     if (!Reg) {
1761       Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
1762       FuncInfo->setSRetReturnReg(Reg);
1763     }
1764     SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
1765     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1766   }
1767
1768   unsigned StackSize = CCInfo.getNextStackOffset();
1769   // Align stack specially for tail calls.
1770   if (FuncIsMadeTailCallSafe(CallConv))
1771     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1772
1773   // If the function takes variable number of arguments, make a frame index for
1774   // the start of the first vararg value... for expansion of llvm.va_start.
1775   if (isVarArg) {
1776     if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
1777                     CallConv != CallingConv::X86_ThisCall)) {
1778       FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize, true));
1779     }
1780     if (Is64Bit) {
1781       unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1782
1783       // FIXME: We should really autogenerate these arrays
1784       static const unsigned GPR64ArgRegsWin64[] = {
1785         X86::RCX, X86::RDX, X86::R8,  X86::R9
1786       };
1787       static const unsigned GPR64ArgRegs64Bit[] = {
1788         X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1789       };
1790       static const unsigned XMMArgRegs64Bit[] = {
1791         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1792         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1793       };
1794       const unsigned *GPR64ArgRegs;
1795       unsigned NumXMMRegs = 0;
1796
1797       if (IsWin64) {
1798         // The XMM registers which might contain var arg parameters are shadowed
1799         // in their paired GPR. So we only need to save the GPRs to their home
1800         // slots.
1801         TotalNumIntRegs = 4;
1802         GPR64ArgRegs = GPR64ArgRegsWin64;
1803       } else {
1804         TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1805         GPR64ArgRegs = GPR64ArgRegs64Bit;
1806
1807         NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs);
1808       }
1809       unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1810                                                        TotalNumIntRegs);
1811
1812       bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
1813       assert(!(NumXMMRegs && !Subtarget->hasXMM()) &&
1814              "SSE register cannot be used when SSE is disabled!");
1815       assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
1816              "SSE register cannot be used when SSE is disabled!");
1817       if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM())
1818         // Kernel mode asks for SSE to be disabled, so don't push them
1819         // on the stack.
1820         TotalNumXMMRegs = 0;
1821
1822       if (IsWin64) {
1823         const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
1824         // Get to the caller-allocated home save location. Add 8 to account
1825         // for the return address.
1826         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
1827         FuncInfo->setRegSaveFrameIndex(
1828             MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
1829         // Fixup to set vararg frame on shadow area (4 x i64).
1830         if (NumIntRegs < 4)
1831           FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
1832       } else {
1833         // For X86-64, if there are vararg parameters that are passed via
1834         // registers, then we must store them to their spots on the stack so
1835         // they may be loaded by dereferencing the result of va_next.
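        // For illustration, on SysV x86-64 the register save area created
        // below is 6 GPRs * 8 bytes + 8 XMM regs * 16 bytes = 176 bytes,
        // with gp_offset starting at NumIntRegs * 8 and fp_offset at
        // 48 + NumXMMRegs * 16, matching what va_arg expects in the va_list.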
1836 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1837 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1838 FuncInfo->setRegSaveFrameIndex( 1839 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1840 false)); 1841 } 1842 1843 // Store the integer parameter registers. 1844 SmallVector<SDValue, 8> MemOps; 1845 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1846 getPointerTy()); 1847 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1848 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1849 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1850 DAG.getIntPtrConstant(Offset)); 1851 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1852 X86::GR64RegisterClass); 1853 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1854 SDValue Store = 1855 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1856 MachinePointerInfo::getFixedStack( 1857 FuncInfo->getRegSaveFrameIndex(), Offset), 1858 false, false, 0); 1859 MemOps.push_back(Store); 1860 Offset += 8; 1861 } 1862 1863 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1864 // Now store the XMM (fp + vector) parameter registers. 1865 SmallVector<SDValue, 11> SaveXMMOps; 1866 SaveXMMOps.push_back(Chain); 1867 1868 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1869 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1870 SaveXMMOps.push_back(ALVal); 1871 1872 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1873 FuncInfo->getRegSaveFrameIndex())); 1874 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1875 FuncInfo->getVarArgsFPOffset())); 1876 1877 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1878 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 1879 X86::VR128RegisterClass); 1880 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1881 SaveXMMOps.push_back(Val); 1882 } 1883 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1884 MVT::Other, 1885 &SaveXMMOps[0], SaveXMMOps.size())); 1886 } 1887 1888 if (!MemOps.empty()) 1889 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1890 &MemOps[0], MemOps.size()); 1891 } 1892 } 1893 1894 // Some CCs need callee pop. 1895 if (Subtarget->IsCalleePop(isVarArg, CallConv)) { 1896 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1897 } else { 1898 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1899 // If this is an sret function, the return should pop the hidden pointer. 1900 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1901 FuncInfo->setBytesToPopOnReturn(4); 1902 } 1903 1904 if (!Is64Bit) { 1905 // RegSaveFrameIndex is X86-64 only. 1906 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1907 if (CallConv == CallingConv::X86_FastCall || 1908 CallConv == CallingConv::X86_ThisCall) 1909 // fastcc functions can't have varargs. 
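    // (0xAAAAAAA is a recognizable sentinel; these indices are never valid
    // on 32-bit targets, so any accidental use should be easy to spot.)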
1910 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1911 } 1912 1913 return Chain; 1914} 1915 1916SDValue 1917X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1918 SDValue StackPtr, SDValue Arg, 1919 DebugLoc dl, SelectionDAG &DAG, 1920 const CCValAssign &VA, 1921 ISD::ArgFlagsTy Flags) const { 1922 unsigned LocMemOffset = VA.getLocMemOffset(); 1923 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1924 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1925 if (Flags.isByVal()) 1926 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1927 1928 return DAG.getStore(Chain, dl, Arg, PtrOff, 1929 MachinePointerInfo::getStack(LocMemOffset), 1930 false, false, 0); 1931} 1932 1933/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1934/// optimization is performed and it is required. 1935SDValue 1936X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1937 SDValue &OutRetAddr, SDValue Chain, 1938 bool IsTailCall, bool Is64Bit, 1939 int FPDiff, DebugLoc dl) const { 1940 // Adjust the Return address stack slot. 1941 EVT VT = getPointerTy(); 1942 OutRetAddr = getReturnAddressFrameIndex(DAG); 1943 1944 // Load the "old" Return address. 1945 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 1946 false, false, 0); 1947 return SDValue(OutRetAddr.getNode(), 1); 1948} 1949 1950/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call 1951/// optimization is performed and it is required (FPDiff!=0). 1952static SDValue 1953EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1954 SDValue Chain, SDValue RetAddrFrIdx, 1955 bool Is64Bit, int FPDiff, DebugLoc dl) { 1956 // Store the return address to the appropriate stack slot. 1957 if (!FPDiff) return Chain; 1958 // Calculate the new stack slot for the return address. 1959 int SlotSize = Is64Bit ? 8 : 4; 1960 int NewReturnAddrFI = 1961 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 1962 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1963 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1964 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1965 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 1966 false, false, 0); 1967 return Chain; 1968} 1969 1970SDValue 1971X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1972 CallingConv::ID CallConv, bool isVarArg, 1973 bool &isTailCall, 1974 const SmallVectorImpl<ISD::OutputArg> &Outs, 1975 const SmallVectorImpl<SDValue> &OutVals, 1976 const SmallVectorImpl<ISD::InputArg> &Ins, 1977 DebugLoc dl, SelectionDAG &DAG, 1978 SmallVectorImpl<SDValue> &InVals) const { 1979 MachineFunction &MF = DAG.getMachineFunction(); 1980 bool Is64Bit = Subtarget->is64Bit(); 1981 bool IsWin64 = Subtarget->isTargetWin64(); 1982 bool IsStructRet = CallIsStructReturn(Outs); 1983 bool IsSibcall = false; 1984 1985 if (isTailCall) { 1986 // Check if it's really possible to do a tail call. 1987 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1988 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1989 Outs, OutVals, Ins, DAG); 1990 1991 // Sibcalls are automatically detected tailcalls which do not require 1992 // ABI changes. 
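    // e.g. a ccc function ending in a 'tail call' to another ccc function
    // whose arguments already occupy the caller's own incoming argument
    // slots can be emitted as a plain jmp. (Illustrative; the precise test
    // is IsEligibleForTailCallOptimization.)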
1993     if (!GuaranteedTailCallOpt && isTailCall)
1994       IsSibcall = true;
1995
1996     if (isTailCall)
1997       ++NumTailCalls;
1998   }
1999
2000   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2001          "Var args not supported with calling convention fastcc or ghc");
2002
2003   // Analyze operands of the call, assigning locations to each operand.
2004   SmallVector<CCValAssign, 16> ArgLocs;
2005   CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
2006                  ArgLocs, *DAG.getContext());
2007
2008   // Allocate shadow area for Win64
2009   if (IsWin64) {
2010     CCInfo.AllocateStack(32, 8);
2011   }
2012
2013   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2014
2015   // Get a count of how many bytes are to be pushed on the stack.
2016   unsigned NumBytes = CCInfo.getNextStackOffset();
2017   if (IsSibcall)
2018     // This is a sibcall. The memory operands are available in the caller's
2019     // own stack.
2020     NumBytes = 0;
2021   else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv))
2022     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2023
2024   int FPDiff = 0;
2025   if (isTailCall && !IsSibcall) {
2026     // Lower arguments at fp - stackoffset + fpdiff.
2027     unsigned NumBytesCallerPushed =
2028       MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
2029     FPDiff = NumBytesCallerPushed - NumBytes;
2030
2031     // Set the delta of movement of the returnaddr stackslot.
2032     // But only set it if the delta is greater than the previous delta.
2033     if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
2034       MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
2035   }
2036
2037   if (!IsSibcall)
2038     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
2039
2040   SDValue RetAddrFrIdx;
2041   // Load return address for tail calls.
2042   if (isTailCall && FPDiff)
2043     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2044                                     Is64Bit, FPDiff, dl);
2045
2046   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2047   SmallVector<SDValue, 8> MemOpChains;
2048   SDValue StackPtr;
2049
2050   // Walk the register/memloc assignments, inserting copies/loads. In the case
2051   // of tail call optimization, arguments are handled later.
2052   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2053     CCValAssign &VA = ArgLocs[i];
2054     EVT RegVT = VA.getLocVT();
2055     SDValue Arg = OutVals[i];
2056     ISD::ArgFlagsTy Flags = Outs[i].Flags;
2057     bool isByVal = Flags.isByVal();
2058
2059     // Promote the value if needed.
2060     switch (VA.getLocInfo()) {
2061     default: llvm_unreachable("Unknown loc info!");
2062     case CCValAssign::Full: break;
2063     case CCValAssign::SExt:
2064       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2065       break;
2066     case CCValAssign::ZExt:
2067       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2068       break;
2069     case CCValAssign::AExt:
2070       if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
2071         // Special case: passing MMX values in XMM registers.
2072         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2073         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2074         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2075       } else
2076         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2077       break;
2078     case CCValAssign::BCvt:
2079       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2080       break;
2081     case CCValAssign::Indirect: {
2082       // Store the argument.
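      // The value is spilled to a fresh stack temporary below, and the
      // address of that slot is what actually gets passed.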
2083 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 2084 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 2085 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 2086 MachinePointerInfo::getFixedStack(FI), 2087 false, false, 0); 2088 Arg = SpillSlot; 2089 break; 2090 } 2091 } 2092 2093 if (VA.isRegLoc()) { 2094 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2095 if (isVarArg && IsWin64) { 2096 // Win64 ABI requires argument XMM reg to be copied to the corresponding 2097 // shadow reg if callee is a varargs function. 2098 unsigned ShadowReg = 0; 2099 switch (VA.getLocReg()) { 2100 case X86::XMM0: ShadowReg = X86::RCX; break; 2101 case X86::XMM1: ShadowReg = X86::RDX; break; 2102 case X86::XMM2: ShadowReg = X86::R8; break; 2103 case X86::XMM3: ShadowReg = X86::R9; break; 2104 } 2105 if (ShadowReg) 2106 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 2107 } 2108 } else if (!IsSibcall && (!isTailCall || isByVal)) { 2109 assert(VA.isMemLoc()); 2110 if (StackPtr.getNode() == 0) 2111 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 2112 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2113 dl, DAG, VA, Flags)); 2114 } 2115 } 2116 2117 if (!MemOpChains.empty()) 2118 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2119 &MemOpChains[0], MemOpChains.size()); 2120 2121 // Build a sequence of copy-to-reg nodes chained together with token chain 2122 // and flag operands which copy the outgoing args into registers. 2123 SDValue InFlag; 2124 // Tail call byval lowering might overwrite argument registers so in case of 2125 // tail call optimization the copies to registers are lowered later. 2126 if (!isTailCall) 2127 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2128 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2129 RegsToPass[i].second, InFlag); 2130 InFlag = Chain.getValue(1); 2131 } 2132 2133 if (Subtarget->isPICStyleGOT()) { 2134 // ELF / PIC requires GOT in the EBX register before function calls via PLT 2135 // GOT pointer. 2136 if (!isTailCall) { 2137 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 2138 DAG.getNode(X86ISD::GlobalBaseReg, 2139 DebugLoc(), getPointerTy()), 2140 InFlag); 2141 InFlag = Chain.getValue(1); 2142 } else { 2143 // If we are tail calling and generating PIC/GOT style code load the 2144 // address of the callee into ECX. The value in ecx is used as target of 2145 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2146 // for tail calls on PIC/GOT architectures. Normally we would just put the 2147 // address of GOT into ebx and then call target@PLT. But for tail calls 2148 // ebx would be restored (since ebx is callee saved) before jumping to the 2149 // target@PLT. 2150 2151 // Note: The actual moving to ECX is done further down. 2152 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2153 if (G && !G->getGlobal()->hasHiddenVisibility() && 2154 !G->getGlobal()->hasProtectedVisibility()) 2155 Callee = LowerGlobalAddress(Callee, DAG); 2156 else if (isa<ExternalSymbolSDNode>(Callee)) 2157 Callee = LowerExternalSymbol(Callee, DAG); 2158 } 2159 } 2160 2161 if (Is64Bit && isVarArg && !IsWin64) { 2162 // From AMD64 ABI document: 2163 // For calls that may call functions that use varargs or stdargs 2164 // (prototype-less calls or calls to functions containing ellipsis (...) in 2165 // the declaration) %al is used as hidden argument to specify the number 2166 // of SSE registers used. 
The contents of %al do not need to match exactly
2167     // the number of registers, but must be an upper bound on the number of
2168     // SSE registers used, and is in the range 0 - 8 inclusive.
2169
2170     // Count the number of XMM registers allocated.
2171     static const unsigned XMMArgRegs[] = {
2172       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2173       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2174     };
2175     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2176     assert((Subtarget->hasXMM() || !NumXMMRegs)
2177            && "SSE registers cannot be used when SSE is disabled");
2178
2179     Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
2180                              DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
2181     InFlag = Chain.getValue(1);
2182   }
2183
2184
2185   // For tail calls lower the arguments to the 'real' stack slot.
2186   if (isTailCall) {
2187     // Force all the incoming stack arguments to be loaded from the stack
2188     // before any new outgoing arguments are stored to the stack, because the
2189     // outgoing stack slots may alias the incoming argument stack slots, and
2190     // the alias isn't otherwise explicit. This is slightly more conservative
2191     // than necessary, because it means that each store effectively depends
2192     // on every argument instead of just those arguments it would clobber.
2193     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2194
2195     SmallVector<SDValue, 8> MemOpChains2;
2196     SDValue FIN;
2197     int FI = 0;
2198     // Do not flag preceding copytoreg stuff together with the following stuff.
2199     InFlag = SDValue();
2200     if (GuaranteedTailCallOpt) {
2201       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2202         CCValAssign &VA = ArgLocs[i];
2203         if (VA.isRegLoc())
2204           continue;
2205         assert(VA.isMemLoc());
2206         SDValue Arg = OutVals[i];
2207         ISD::ArgFlagsTy Flags = Outs[i].Flags;
2208         // Create frame index.
2209         int32_t Offset = VA.getLocMemOffset()+FPDiff;
2210         uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2211         FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2212         FIN = DAG.getFrameIndex(FI, getPointerTy());
2213
2214         if (Flags.isByVal()) {
2215           // Copy relative to framepointer.
2216           SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2217           if (StackPtr.getNode() == 0)
2218             StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
2219                                           getPointerTy());
2220           Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2221
2222           MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2223                                                            ArgChain,
2224                                                            Flags, DAG, dl));
2225         } else {
2226           // Store relative to framepointer.
2227           MemOpChains2.push_back(
2228             DAG.getStore(ArgChain, dl, Arg, FIN,
2229                          MachinePointerInfo::getFixedStack(FI),
2230                          false, false, 0));
2231         }
2232       }
2233     }
2234
2235     if (!MemOpChains2.empty())
2236       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2237                           &MemOpChains2[0], MemOpChains2.size());
2238
2239     // Copy arguments to their registers.
2240     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2241       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2242                                RegsToPass[i].second, InFlag);
2243       InFlag = Chain.getValue(1);
2244     }
2245     InFlag = SDValue();
2246
2247     // Store the return address to the appropriate stack slot.
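    // e.g. if the caller pushed 16 bytes of arguments but this call needs
    // 32, FPDiff is -16, and the return address is re-stored at that delta
    // so the callee still finds it where it expects, adjacent to its own
    // arguments. (Worked example; FPDiff was computed above.)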
2248     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
2249                                      FPDiff, dl);
2250   }
2251
2252   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2253     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2254     // In the 64-bit large code model, we have to make all calls
2255     // through a register, since the call instruction's 32-bit
2256     // pc-relative offset may not be large enough to hold the whole
2257     // address.
2258   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2259     // If the callee is a GlobalAddress node (quite common, every direct call
2260     // is), turn it into a TargetGlobalAddress node so that legalize doesn't
2261     // hack it.
2262
2263     // We should use extra load for direct calls to dllimported functions in
2264     // non-JIT mode.
2265     const GlobalValue *GV = G->getGlobal();
2266     if (!GV->hasDLLImportLinkage()) {
2267       unsigned char OpFlags = 0;
2268
2269       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2270       // external symbols must go through the PLT in PIC mode. If the symbol
2271       // has hidden or protected visibility, or if it is static or local, then
2272       // we don't need to use the PLT - we can directly call it.
2273       if (Subtarget->isTargetELF() &&
2274           getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2275           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2276         OpFlags = X86II::MO_PLT;
2277       } else if (Subtarget->isPICStyleStubAny() &&
2278                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
2279                  (!Subtarget->getTargetTriple().isMacOSX() ||
2280                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2281         // PC-relative references to external symbols should go through $stub,
2282         // unless we're building with the Leopard linker or later, which
2283         // automatically synthesizes these stubs.
2284         OpFlags = X86II::MO_DARWIN_STUB;
2285       }
2286
2287       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
2288                                           G->getOffset(), OpFlags);
2289     }
2290   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2291     unsigned char OpFlags = 0;
2292
2293     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
2294     // external symbols should go through the PLT.
2295     if (Subtarget->isTargetELF() &&
2296         getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2297       OpFlags = X86II::MO_PLT;
2298     } else if (Subtarget->isPICStyleStubAny() &&
2299                (!Subtarget->getTargetTriple().isMacOSX() ||
2300                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2301       // PC-relative references to external symbols should go through $stub,
2302       // unless we're building with the Leopard linker or later, which
2303       // automatically synthesizes these stubs.
2304       OpFlags = X86II::MO_DARWIN_STUB;
2305     }
2306
2307     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2308                                          OpFlags);
2309   }
2310
2311   // Returns a chain & a flag for retval copy to use.
2312   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2313   SmallVector<SDValue, 8> Ops;
2314
2315   if (!IsSibcall && isTailCall) {
2316     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2317                                DAG.getIntPtrConstant(0, true), InFlag);
2318     InFlag = Chain.getValue(1);
2319   }
2320
2321   Ops.push_back(Chain);
2322   Ops.push_back(Callee);
2323
2324   if (isTailCall)
2325     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2326
2327   // Add argument registers to the end of the list so that they are known live
2328   // into the call.
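  // Listing them as operands makes them implicit uses of the call node, which
  // keeps the scheduler and register allocator from reusing them between the
  // copies above and the call itself.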
2329   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2330     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2331                                   RegsToPass[i].second.getValueType()));
2332
2333   // Add an implicit use of the GOT pointer in EBX.
2334   if (!isTailCall && Subtarget->isPICStyleGOT())
2335     Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
2336
2337   // Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
2338   if (Is64Bit && isVarArg && !IsWin64)
2339     Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
2340
2341   if (InFlag.getNode())
2342     Ops.push_back(InFlag);
2343
2344   if (isTailCall) {
2345     // We used to do:
2346     //// If this is the first return lowered for this function, add the regs
2347     //// to the liveout set for the function.
2348     // This isn't right, although it's probably harmless on x86; liveouts
2349     // should be computed from returns, not tail calls. Consider a void
2350     // function making a tail call to a function returning int.
2351     return DAG.getNode(X86ISD::TC_RETURN, dl,
2352                        NodeTys, &Ops[0], Ops.size());
2353   }
2354
2355   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2356   InFlag = Chain.getValue(1);
2357
2358   // Create the CALLSEQ_END node.
2359   unsigned NumBytesForCalleeToPush;
2360   if (Subtarget->IsCalleePop(isVarArg, CallConv))
2361     NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2362   else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
2363     // If this is a call to a struct-return function, the callee
2364     // pops the hidden struct pointer, so we have to push it back.
2365     // This is common for Darwin/X86, Linux & Mingw32 targets.
2366     NumBytesForCalleeToPush = 4;
2367   else
2368     NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2369
2370   // Returns a flag for retval copy to use.
2371   if (!IsSibcall) {
2372     Chain = DAG.getCALLSEQ_END(Chain,
2373                                DAG.getIntPtrConstant(NumBytes, true),
2374                                DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2375                                                      true),
2376                                InFlag);
2377     InFlag = Chain.getValue(1);
2378   }
2379
2380   // Handle result values, copying them out of physregs into vregs that we
2381   // return.
2382   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2383                          Ins, dl, DAG, InVals);
2384 }
2385
2386
2387 //===----------------------------------------------------------------------===//
2388 //                Fast Calling Convention (tail call) implementation
2389 //===----------------------------------------------------------------------===//
2390
2391 //  Like stdcall, this is a callee-cleanup convention, except that ECX is
2392 //  reserved for storing the address of the tail-called function. Only 2
2393 //  registers are free for argument passing (inreg). Tail call optimization is
2394 //  performed provided:
2395 //    * tailcallopt is enabled
2396 //    * caller/callee are fastcc
2397 //  On the X86_64 architecture with GOT-style position-independent code, only
2398 //  local (within module) calls are supported at the moment.
2399 //  To keep the stack aligned according to the platform ABI, the function
2400 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
2401 //  multiple of the stack alignment. (Dynamic linkers need this - Darwin's
2402 //  dyld, for example.) If a tail-called callee has more arguments than the
2403 //  caller, the caller needs to make sure that there is room to move the
2404 //  RETADDR to. This is achieved by reserving an area the size of the argument
2405 //  delta right after the original RETADDR, but before the saved frame pointer
2406 //  or the spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
2407 //  stack layout:
2408 //    arg1
2409 //    arg2
2410 //    RETADDR
2411 //    [ new RETADDR
2412 //      move area ]
2413 //    (possible EBP)
2414 //    ESI
2415 //    EDI
2416 //    local1 ..
2417
2418 /// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 for
2419 /// a 16-byte alignment requirement, leaving room for the return-address slot.
2420 unsigned
2421 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2422                                                SelectionDAG& DAG) const {
2423   MachineFunction &MF = DAG.getMachineFunction();
2424   const TargetMachine &TM = MF.getTarget();
2425   const TargetFrameLowering &TFI = *TM.getFrameLowering();
2426   unsigned StackAlignment = TFI.getStackAlignment();
2427   uint64_t AlignMask = StackAlignment - 1;
2428   int64_t Offset = StackSize;
2429   uint64_t SlotSize = TD->getPointerSize();
2430   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2431     // The low bits are at most StackAlignment - SlotSize; just add the difference.
2432     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2433   } else {
2434     // Mask out the lower bits; add StackAlignment once plus the remainder.
2435     Offset = ((~AlignMask) & Offset) + StackAlignment +
2436              (StackAlignment-SlotSize);
2437   }
2438   return Offset;
2439 }
2440
2441 /// MatchingStackOffset - Return true if the given stack call argument is
2442 /// already available in the same position (relatively) of the caller's
2443 /// incoming argument stack.
2444 static
2445 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2446                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2447                          const X86InstrInfo *TII) {
2448   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
2449   int FI = INT_MAX;
2450   if (Arg.getOpcode() == ISD::CopyFromReg) {
2451     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2452     if (!TargetRegisterInfo::isVirtualRegister(VR))
2453       return false;
2454     MachineInstr *Def = MRI->getVRegDef(VR);
2455     if (!Def)
2456       return false;
2457     if (!Flags.isByVal()) {
2458       if (!TII->isLoadFromStackSlot(Def, FI))
2459         return false;
2460     } else {
2461       unsigned Opcode = Def->getOpcode();
2462       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
2463           Def->getOperand(1).isFI()) {
2464         FI = Def->getOperand(1).getIndex();
2465         Bytes = Flags.getByValSize();
2466       } else
2467         return false;
2468     }
2469   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2470     if (Flags.isByVal())
2471       // ByVal argument is passed in as a pointer but it's now being
2472       // dereferenced. e.g.
2473       // define @foo(%struct.X* %A) {
2474       //   tail call @bar(%struct.X* byval %A)
2475       // }
2476       return false;
2477     SDValue Ptr = Ld->getBasePtr();
2478     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2479     if (!FINode)
2480       return false;
2481     FI = FINode->getIndex();
2482   } else
2483     return false;
2484
2485   assert(FI != INT_MAX);
2486   if (!MFI->isFixedObjectIndex(FI))
2487     return false;
2488   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
2489 }
2490
2491 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
2492 /// for tail call optimization. Targets which want to do tail call
2493 /// optimization should implement this function.
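/// e.g. under -tailcallopt a fastcc caller/callee pair qualifies; without it,
/// only "sibcalls" whose stack arguments already match the caller's frame are
/// accepted. (Illustrative summary of the checks below.)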
2494bool 2495X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2496 CallingConv::ID CalleeCC, 2497 bool isVarArg, 2498 bool isCalleeStructRet, 2499 bool isCallerStructRet, 2500 const SmallVectorImpl<ISD::OutputArg> &Outs, 2501 const SmallVectorImpl<SDValue> &OutVals, 2502 const SmallVectorImpl<ISD::InputArg> &Ins, 2503 SelectionDAG& DAG) const { 2504 if (!IsTailCallConvention(CalleeCC) && 2505 CalleeCC != CallingConv::C) 2506 return false; 2507 2508 // If -tailcallopt is specified, make fastcc functions tail-callable. 2509 const MachineFunction &MF = DAG.getMachineFunction(); 2510 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2511 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2512 bool CCMatch = CallerCC == CalleeCC; 2513 2514 if (GuaranteedTailCallOpt) { 2515 if (IsTailCallConvention(CalleeCC) && CCMatch) 2516 return true; 2517 return false; 2518 } 2519 2520 // Look for obvious safe cases to perform tail call optimization that do not 2521 // require ABI changes. This is what gcc calls sibcall. 2522 2523 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2524 // emit a special epilogue. 2525 if (RegInfo->needsStackRealignment(MF)) 2526 return false; 2527 2528 // Also avoid sibcall optimization if either caller or callee uses struct 2529 // return semantics. 2530 if (isCalleeStructRet || isCallerStructRet) 2531 return false; 2532 2533 // Do not sibcall optimize vararg calls unless all arguments are passed via 2534 // registers 2535 if (isVarArg && !Outs.empty()) { 2536 SmallVector<CCValAssign, 16> ArgLocs; 2537 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2538 ArgLocs, *DAG.getContext()); 2539 2540 // Allocate shadow area for Win64 2541 if (Subtarget->isTargetWin64()) { 2542 CCInfo.AllocateStack(32, 8); 2543 } 2544 2545 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2546 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 2547 if (!ArgLocs[i].isRegLoc()) 2548 return false; 2549 } 2550 2551 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2552 // Therefore if it's not used by the call it is not safe to optimize this into 2553 // a sibcall. 2554 bool Unused = false; 2555 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2556 if (!Ins[i].Used) { 2557 Unused = true; 2558 break; 2559 } 2560 } 2561 if (Unused) { 2562 SmallVector<CCValAssign, 16> RVLocs; 2563 CCState CCInfo(CalleeCC, false, getTargetMachine(), 2564 RVLocs, *DAG.getContext()); 2565 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2566 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2567 CCValAssign &VA = RVLocs[i]; 2568 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2569 return false; 2570 } 2571 } 2572 2573 // If the calling conventions do not match, then we'd better make sure the 2574 // results are returned in the same way as what the caller expects. 
2575 if (!CCMatch) { 2576 SmallVector<CCValAssign, 16> RVLocs1; 2577 CCState CCInfo1(CalleeCC, false, getTargetMachine(), 2578 RVLocs1, *DAG.getContext()); 2579 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2580 2581 SmallVector<CCValAssign, 16> RVLocs2; 2582 CCState CCInfo2(CallerCC, false, getTargetMachine(), 2583 RVLocs2, *DAG.getContext()); 2584 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2585 2586 if (RVLocs1.size() != RVLocs2.size()) 2587 return false; 2588 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2589 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2590 return false; 2591 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2592 return false; 2593 if (RVLocs1[i].isRegLoc()) { 2594 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2595 return false; 2596 } else { 2597 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2598 return false; 2599 } 2600 } 2601 } 2602 2603 // If the callee takes no arguments then go on to check the results of the 2604 // call. 2605 if (!Outs.empty()) { 2606 // Check if stack adjustment is needed. For now, do not do this if any 2607 // argument is passed on the stack. 2608 SmallVector<CCValAssign, 16> ArgLocs; 2609 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2610 ArgLocs, *DAG.getContext()); 2611 2612 // Allocate shadow area for Win64 2613 if (Subtarget->isTargetWin64()) { 2614 CCInfo.AllocateStack(32, 8); 2615 } 2616 2617 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2618 if (CCInfo.getNextStackOffset()) { 2619 MachineFunction &MF = DAG.getMachineFunction(); 2620 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2621 return false; 2622 2623 // Check if the arguments are already laid out in the right way as 2624 // the caller's fixed stack objects. 2625 MachineFrameInfo *MFI = MF.getFrameInfo(); 2626 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2627 const X86InstrInfo *TII = 2628 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2629 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2630 CCValAssign &VA = ArgLocs[i]; 2631 SDValue Arg = OutVals[i]; 2632 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2633 if (VA.getLocInfo() == CCValAssign::Indirect) 2634 return false; 2635 if (!VA.isRegLoc()) { 2636 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2637 MFI, MRI, TII)) 2638 return false; 2639 } 2640 } 2641 } 2642 2643 // If the tailcall address may be in a register, then make sure it's 2644 // possible to register allocate for it. In 32-bit, the call address can 2645 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2646 // callee-saved registers are restored. These happen to be the same 2647 // registers used to pass 'inreg' arguments so watch out for those. 2648 if (!Subtarget->is64Bit() && 2649 !isa<GlobalAddressSDNode>(Callee) && 2650 !isa<ExternalSymbolSDNode>(Callee)) { 2651 unsigned NumInRegs = 0; 2652 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2653 CCValAssign &VA = ArgLocs[i]; 2654 if (!VA.isRegLoc()) 2655 continue; 2656 unsigned Reg = VA.getLocReg(); 2657 switch (Reg) { 2658 default: break; 2659 case X86::EAX: case X86::EDX: case X86::ECX: 2660 if (++NumInRegs == 3) 2661 return false; 2662 break; 2663 } 2664 } 2665 } 2666 } 2667 2668 // An stdcall caller is expected to clean up its arguments; the callee 2669 // isn't going to do that. 
2670 if (!CCMatch && CallerCC==CallingConv::X86_StdCall) 2671 return false; 2672 2673 return true; 2674} 2675 2676FastISel * 2677X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2678 return X86::createFastISel(funcInfo); 2679} 2680 2681 2682//===----------------------------------------------------------------------===// 2683// Other Lowering Hooks 2684//===----------------------------------------------------------------------===// 2685 2686static bool MayFoldLoad(SDValue Op) { 2687 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 2688} 2689 2690static bool MayFoldIntoStore(SDValue Op) { 2691 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 2692} 2693 2694static bool isTargetShuffle(unsigned Opcode) { 2695 switch(Opcode) { 2696 default: return false; 2697 case X86ISD::PSHUFD: 2698 case X86ISD::PSHUFHW: 2699 case X86ISD::PSHUFLW: 2700 case X86ISD::SHUFPD: 2701 case X86ISD::PALIGN: 2702 case X86ISD::SHUFPS: 2703 case X86ISD::MOVLHPS: 2704 case X86ISD::MOVLHPD: 2705 case X86ISD::MOVHLPS: 2706 case X86ISD::MOVLPS: 2707 case X86ISD::MOVLPD: 2708 case X86ISD::MOVSHDUP: 2709 case X86ISD::MOVSLDUP: 2710 case X86ISD::MOVDDUP: 2711 case X86ISD::MOVSS: 2712 case X86ISD::MOVSD: 2713 case X86ISD::UNPCKLPS: 2714 case X86ISD::UNPCKLPD: 2715 case X86ISD::VUNPCKLPS: 2716 case X86ISD::VUNPCKLPD: 2717 case X86ISD::VUNPCKLPSY: 2718 case X86ISD::VUNPCKLPDY: 2719 case X86ISD::PUNPCKLWD: 2720 case X86ISD::PUNPCKLBW: 2721 case X86ISD::PUNPCKLDQ: 2722 case X86ISD::PUNPCKLQDQ: 2723 case X86ISD::UNPCKHPS: 2724 case X86ISD::UNPCKHPD: 2725 case X86ISD::PUNPCKHWD: 2726 case X86ISD::PUNPCKHBW: 2727 case X86ISD::PUNPCKHDQ: 2728 case X86ISD::PUNPCKHQDQ: 2729 return true; 2730 } 2731 return false; 2732} 2733 2734static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2735 SDValue V1, SelectionDAG &DAG) { 2736 switch(Opc) { 2737 default: llvm_unreachable("Unknown x86 shuffle node"); 2738 case X86ISD::MOVSHDUP: 2739 case X86ISD::MOVSLDUP: 2740 case X86ISD::MOVDDUP: 2741 return DAG.getNode(Opc, dl, VT, V1); 2742 } 2743 2744 return SDValue(); 2745} 2746 2747static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2748 SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { 2749 switch(Opc) { 2750 default: llvm_unreachable("Unknown x86 shuffle node"); 2751 case X86ISD::PSHUFD: 2752 case X86ISD::PSHUFHW: 2753 case X86ISD::PSHUFLW: 2754 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2755 } 2756 2757 return SDValue(); 2758} 2759 2760static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2761 SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { 2762 switch(Opc) { 2763 default: llvm_unreachable("Unknown x86 shuffle node"); 2764 case X86ISD::PALIGN: 2765 case X86ISD::SHUFPD: 2766 case X86ISD::SHUFPS: 2767 return DAG.getNode(Opc, dl, VT, V1, V2, 2768 DAG.getConstant(TargetMask, MVT::i8)); 2769 } 2770 return SDValue(); 2771} 2772 2773static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2774 SDValue V1, SDValue V2, SelectionDAG &DAG) { 2775 switch(Opc) { 2776 default: llvm_unreachable("Unknown x86 shuffle node"); 2777 case X86ISD::MOVLHPS: 2778 case X86ISD::MOVLHPD: 2779 case X86ISD::MOVHLPS: 2780 case X86ISD::MOVLPS: 2781 case X86ISD::MOVLPD: 2782 case X86ISD::MOVSS: 2783 case X86ISD::MOVSD: 2784 case X86ISD::UNPCKLPS: 2785 case X86ISD::UNPCKLPD: 2786 case X86ISD::VUNPCKLPS: 2787 case X86ISD::VUNPCKLPD: 2788 case X86ISD::VUNPCKLPSY: 2789 case X86ISD::VUNPCKLPDY: 2790 case 
X86ISD::PUNPCKLWD:
2791   case X86ISD::PUNPCKLBW:
2792   case X86ISD::PUNPCKLDQ:
2793   case X86ISD::PUNPCKLQDQ:
2794   case X86ISD::UNPCKHPS:
2795   case X86ISD::UNPCKHPD:
2796   case X86ISD::PUNPCKHWD:
2797   case X86ISD::PUNPCKHBW:
2798   case X86ISD::PUNPCKHDQ:
2799   case X86ISD::PUNPCKHQDQ:
2800     return DAG.getNode(Opc, dl, VT, V1, V2);
2801   }
2802   return SDValue();
2803 }
2804
2805 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
2806   MachineFunction &MF = DAG.getMachineFunction();
2807   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2808   int ReturnAddrIndex = FuncInfo->getRAIndex();
2809
2810   if (ReturnAddrIndex == 0) {
2811     // Set up a frame object for the return address.
2812     uint64_t SlotSize = TD->getPointerSize();
2813     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
2814                                                            false);
2815     FuncInfo->setRAIndex(ReturnAddrIndex);
2816   }
2817
2818   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2819 }
2820
2821
2822 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2823                                        bool hasSymbolicDisplacement) {
2824   // The offset should fit into a 32-bit immediate field.
2825   if (!isInt<32>(Offset))
2826     return false;
2827
2828   // If we don't have a symbolic displacement - we don't have any extra
2829   // restrictions.
2830   if (!hasSymbolicDisplacement)
2831     return true;
2832
2833   // FIXME: Some tweaks might be needed for medium code model.
2834   if (M != CodeModel::Small && M != CodeModel::Kernel)
2835     return false;
2836
2837   // For the small code model, we assume that the latest object ends at least
2838   // 16MB before the end of the 31-bit boundary. We may also accept pretty
2839   // large negative constants, knowing that all objects are in the positive
2840   // half of the address space.
2841   if (M == CodeModel::Small && Offset < 16*1024*1024)
2842     return true;
2843
2844   // For the kernel code model, we know that all objects reside in the
2845   // negative half of the 32-bit address space. We must not accept negative
2846   // offsets, since they may fall just outside that half, but we may accept
2847   // pretty large positive ones.
2848   if (M == CodeModel::Kernel && Offset > 0)
2849     return true;
2850
2851   return false;
2852 }
2853
2854 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
2855 /// X86-specific condition code, returning the condition code and the LHS/RHS
2856 /// of the comparison to make.
2857 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
2858                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
2859   if (!isFP) {
2860     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2861       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
2862         // X > -1   -> X == 0, jump !sign.
2863         RHS = DAG.getConstant(0, RHS.getValueType());
2864         return X86::COND_NS;
2865       } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
2866         // X < 0   -> X == 0, jump on sign.
2865        return X86::COND_S;
2866      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
2867        // X < 1   -> X <= 0
2868        RHS = DAG.getConstant(0, RHS.getValueType());
2869        return X86::COND_LE;
2870      }
2871    }
2872
2873    switch (SetCCOpcode) {
2874    default: llvm_unreachable("Invalid integer condition!");
2875    case ISD::SETEQ:  return X86::COND_E;
2876    case ISD::SETGT:  return X86::COND_G;
2877    case ISD::SETGE:  return X86::COND_GE;
2878    case ISD::SETLT:  return X86::COND_L;
2879    case ISD::SETLE:  return X86::COND_LE;
2880    case ISD::SETNE:  return X86::COND_NE;
2881    case ISD::SETULT: return X86::COND_B;
2882    case ISD::SETUGT: return X86::COND_A;
2883    case ISD::SETULE: return X86::COND_BE;
2884    case ISD::SETUGE: return X86::COND_AE;
2885    }
2886  }
2887
2888  // First determine if it is required or profitable to flip the operands.
2889
2890  // If LHS is a foldable load, but RHS is not, flip the condition.
2891  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
2892      !ISD::isNON_EXTLoad(RHS.getNode())) {
2893    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2894    std::swap(LHS, RHS);
2895  }
2896
2897  switch (SetCCOpcode) {
2898  default: break;
2899  case ISD::SETOLT:
2900  case ISD::SETOLE:
2901  case ISD::SETUGT:
2902  case ISD::SETUGE:
2903    std::swap(LHS, RHS);
2904    break;
2905  }
2906
2907  // On a floating point condition, the flags are set as follows:
2908  //  ZF PF CF   op
2909  //   0 | 0 | 0 | X > Y
2910  //   0 | 0 | 1 | X < Y
2911  //   1 | 0 | 0 | X == Y
2912  //   1 | 1 | 1 | unordered
2913  switch (SetCCOpcode) {
2914  default: llvm_unreachable("Condcode should be pre-legalized away");
2915  case ISD::SETUEQ:
2916  case ISD::SETEQ:   return X86::COND_E;
2917  case ISD::SETOLT:  // flipped
2918  case ISD::SETOGT:
2919  case ISD::SETGT:   return X86::COND_A;
2920  case ISD::SETOLE:  // flipped
2921  case ISD::SETOGE:
2922  case ISD::SETGE:   return X86::COND_AE;
2923  case ISD::SETUGT:  // flipped
2924  case ISD::SETULT:
2925  case ISD::SETLT:   return X86::COND_B;
2926  case ISD::SETUGE:  // flipped
2927  case ISD::SETULE:
2928  case ISD::SETLE:   return X86::COND_BE;
2929  case ISD::SETONE:
2930  case ISD::SETNE:   return X86::COND_NE;
2931  case ISD::SETUO:   return X86::COND_P;
2932  case ISD::SETO:    return X86::COND_NP;
2933  case ISD::SETOEQ:
2934  case ISD::SETUNE:  return X86::COND_INVALID;
2935  }
2936}
2937
2938/// hasFPCMov - Is there a floating point cmov for the specific X86 condition
2939/// code? The current x86 ISA includes the following FP cmov instructions:
2940/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2941static bool hasFPCMov(unsigned X86CC) {
2942  switch (X86CC) {
2943  default:
2944    return false;
2945  case X86::COND_B:
2946  case X86::COND_BE:
2947  case X86::COND_E:
2948  case X86::COND_P:
2949  case X86::COND_A:
2950  case X86::COND_AE:
2951  case X86::COND_NE:
2952  case X86::COND_NP:
2953    return true;
2954  }
2955}
2956
2957/// isFPImmLegal - Returns true if the target can instruction select the
2958/// specified FP immediate natively. If false, the legalizer will
2959/// materialize the FP immediate as a load from a constant pool.
2960bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
2961  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
2962    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
2963      return true;
2964  }
2965  return false;
2966}
2967
2968/// isUndefOrInRange - Return true if Val is undef or if its value falls within
2969/// the specified range [Low, Hi).
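/// For example, isUndefOrInRange(-1, 0, 4) and isUndefOrInRange(3, 0, 4)
/// return true, while isUndefOrInRange(4, 0, 4) returns false.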
2970static bool isUndefOrInRange(int Val, int Low, int Hi) { 2971 return (Val < 0) || (Val >= Low && Val < Hi); 2972} 2973 2974/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2975/// specified value. 2976static bool isUndefOrEqual(int Val, int CmpVal) { 2977 if (Val < 0 || Val == CmpVal) 2978 return true; 2979 return false; 2980} 2981 2982/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2983/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2984/// the second operand. 2985static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2986 if (VT == MVT::v4f32 || VT == MVT::v4i32 ) 2987 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2988 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2989 return (Mask[0] < 2 && Mask[1] < 2); 2990 return false; 2991} 2992 2993bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2994 SmallVector<int, 8> M; 2995 N->getMask(M); 2996 return ::isPSHUFDMask(M, N->getValueType(0)); 2997} 2998 2999/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 3000/// is suitable for input to PSHUFHW. 3001static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3002 if (VT != MVT::v8i16) 3003 return false; 3004 3005 // Lower quadword copied in order or undef. 3006 for (int i = 0; i != 4; ++i) 3007 if (Mask[i] >= 0 && Mask[i] != i) 3008 return false; 3009 3010 // Upper quadword shuffled. 3011 for (int i = 4; i != 8; ++i) 3012 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 3013 return false; 3014 3015 return true; 3016} 3017 3018bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 3019 SmallVector<int, 8> M; 3020 N->getMask(M); 3021 return ::isPSHUFHWMask(M, N->getValueType(0)); 3022} 3023 3024/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 3025/// is suitable for input to PSHUFLW. 3026static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3027 if (VT != MVT::v8i16) 3028 return false; 3029 3030 // Upper quadword copied in order. 3031 for (int i = 4; i != 8; ++i) 3032 if (Mask[i] >= 0 && Mask[i] != i) 3033 return false; 3034 3035 // Lower quadword shuffled. 3036 for (int i = 0; i != 4; ++i) 3037 if (Mask[i] >= 4) 3038 return false; 3039 3040 return true; 3041} 3042 3043bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 3044 SmallVector<int, 8> M; 3045 N->getMask(M); 3046 return ::isPSHUFLWMask(M, N->getValueType(0)); 3047} 3048 3049/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 3050/// is suitable for input to PALIGNR. 3051static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 3052 bool hasSSSE3) { 3053 int i, e = VT.getVectorNumElements(); 3054 3055 // Do not handle v2i64 / v2f64 shuffles with palignr. 3056 if (e < 4 || !hasSSSE3) 3057 return false; 3058 3059 for (i = 0; i != e; ++i) 3060 if (Mask[i] >= 0) 3061 break; 3062 3063 // All undef, not a palignr. 3064 if (i == e) 3065 return false; 3066 3067 // Determine if it's ok to perform a palignr with only the LHS, since we 3068 // don't have access to the actual shuffle elements to see if RHS is undef. 3069 bool Unary = Mask[i] < (int)e; 3070 bool NeedsUnary = false; 3071 3072 int s = Mask[i] - i; 3073 3074 // Check the rest of the elements to see if they are consecutive. 
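  // For example, for v8i16 the mask <1, 2, 3, 4, 5, 6, 7, 8> has s == 1 and
  // passes the checks below with elements from both sources: it shifts the
  // concatenation of the two vectors right by one 16-bit element, i.e. a
  // PALIGNR immediate of 2 bytes.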
3075  for (++i; i != e; ++i) {
3076    int m = Mask[i];
3077    if (m < 0)
3078      continue;
3079
3080    Unary = Unary && (m < (int)e);
3081    NeedsUnary = NeedsUnary || (m < s);
3082
3083    if (NeedsUnary && !Unary)
3084      return false;
3085    if (Unary && m != ((s+i) & (e-1)))
3086      return false;
3087    if (!Unary && m != (s+i))
3088      return false;
3089  }
3090  return true;
3091}
3092
3093bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
3094  SmallVector<int, 8> M;
3095  N->getMask(M);
3096  return ::isPALIGNRMask(M, N->getValueType(0), true);
3097}
3098
3099/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
3100/// specifies a shuffle of elements that is suitable for input to SHUFP*.
3101static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
3102  int NumElems = VT.getVectorNumElements();
3103  if (NumElems != 2 && NumElems != 4)
3104    return false;
3105
3106  int Half = NumElems / 2;
3107  for (int i = 0; i < Half; ++i)
3108    if (!isUndefOrInRange(Mask[i], 0, NumElems))
3109      return false;
3110  for (int i = Half; i < NumElems; ++i)
3111    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
3112      return false;
3113
3114  return true;
3115}
3116
3117bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
3118  SmallVector<int, 8> M;
3119  N->getMask(M);
3120  return ::isSHUFPMask(M, N->getValueType(0));
3121}
3122
3123/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
3124/// the reverse of what x86 shuffles want. x86 shuffles require the lower
3125/// half elements to come from vector 1 (which would equal the dest.) and
3126/// the upper half to come from vector 2.
3127static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
3128  int NumElems = VT.getVectorNumElements();
3129
3130  if (NumElems != 2 && NumElems != 4)
3131    return false;
3132
3133  int Half = NumElems / 2;
3134  for (int i = 0; i < Half; ++i)
3135    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
3136      return false;
3137  for (int i = Half; i < NumElems; ++i)
3138    if (!isUndefOrInRange(Mask[i], 0, NumElems))
3139      return false;
3140  return true;
3141}
3142
3143static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
3144  SmallVector<int, 8> M;
3145  N->getMask(M);
3146  return isCommutedSHUFPMask(M, N->getValueType(0));
3147}
3148
3149/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
3150/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
3151bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
3152  if (N->getValueType(0).getVectorNumElements() != 4)
3153    return false;
3154
3155  // Expect elt0 == 6, elt1 == 7, elt2 == 2, elt3 == 3.
3156  return isUndefOrEqual(N->getMaskElt(0), 6) &&
3157         isUndefOrEqual(N->getMaskElt(1), 7) &&
3158         isUndefOrEqual(N->getMaskElt(2), 2) &&
3159         isUndefOrEqual(N->getMaskElt(3), 3);
3160}
3161
3162/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
3163/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
3164/// <2, 3, 2, 3>
3165bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
3166  unsigned NumElems = N->getValueType(0).getVectorNumElements();
3167
3168  if (NumElems != 4)
3169    return false;
3170
3171  return isUndefOrEqual(N->getMaskElt(0), 2) &&
3172         isUndefOrEqual(N->getMaskElt(1), 3) &&
3173         isUndefOrEqual(N->getMaskElt(2), 2) &&
3174         isUndefOrEqual(N->getMaskElt(3), 3);
3175}
3176
3177/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
3178/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
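/// For example, for v4f32 the mask <4, 5, 2, 3> takes the low half from V2
/// and keeps the high half of V1, which is what movlps produces.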
3179bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 3180 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3181 3182 if (NumElems != 2 && NumElems != 4) 3183 return false; 3184 3185 for (unsigned i = 0; i < NumElems/2; ++i) 3186 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 3187 return false; 3188 3189 for (unsigned i = NumElems/2; i < NumElems; ++i) 3190 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3191 return false; 3192 3193 return true; 3194} 3195 3196/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3197/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 3198bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 3199 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3200 3201 if ((NumElems != 2 && NumElems != 4) 3202 || N->getValueType(0).getSizeInBits() > 128) 3203 return false; 3204 3205 for (unsigned i = 0; i < NumElems/2; ++i) 3206 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3207 return false; 3208 3209 for (unsigned i = 0; i < NumElems/2; ++i) 3210 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 3211 return false; 3212 3213 return true; 3214} 3215 3216/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3217/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3218static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3219 bool V2IsSplat = false) { 3220 int NumElts = VT.getVectorNumElements(); 3221 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3222 return false; 3223 3224 // Handle vector lengths > 128 bits. Define a "section" as a set of 3225 // 128 bits. AVX defines UNPCK* to operate independently on 128-bit 3226 // sections. 3227 unsigned NumSections = VT.getSizeInBits() / 128; 3228 if (NumSections == 0 ) NumSections = 1; // Handle MMX 3229 unsigned NumSectionElts = NumElts / NumSections; 3230 3231 unsigned Start = 0; 3232 unsigned End = NumSectionElts; 3233 for (unsigned s = 0; s < NumSections; ++s) { 3234 for (unsigned i = Start, j = s * NumSectionElts; 3235 i != End; 3236 i += 2, ++j) { 3237 int BitI = Mask[i]; 3238 int BitI1 = Mask[i+1]; 3239 if (!isUndefOrEqual(BitI, j)) 3240 return false; 3241 if (V2IsSplat) { 3242 if (!isUndefOrEqual(BitI1, NumElts)) 3243 return false; 3244 } else { 3245 if (!isUndefOrEqual(BitI1, j + NumElts)) 3246 return false; 3247 } 3248 } 3249 // Process the next 128 bits. 3250 Start += NumSectionElts; 3251 End += NumSectionElts; 3252 } 3253 3254 return true; 3255} 3256 3257bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3258 SmallVector<int, 8> M; 3259 N->getMask(M); 3260 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3261} 3262 3263/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3264/// specifies a shuffle of elements that is suitable for input to UNPCKH. 
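/// For example, for v4i32 unpckh interleaves the high halves of the two
/// sources, giving the mask <2, 6, 3, 7>.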
3265static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
3266                         bool V2IsSplat = false) {
3267  int NumElts = VT.getVectorNumElements();
3268  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
3269    return false;
3270
3271  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
3272    int BitI  = Mask[i];
3273    int BitI1 = Mask[i+1];
3274    if (!isUndefOrEqual(BitI, j + NumElts/2))
3275      return false;
3276    if (V2IsSplat) {
3277      if (!isUndefOrEqual(BitI1, NumElts))
3278        return false;
3279    } else {
3280      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
3281        return false;
3282    }
3283  }
3284  return true;
3285}
3286
3287bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
3288  SmallVector<int, 8> M;
3289  N->getMask(M);
3290  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
3291}
3292
3293/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
3294/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
3295/// <0, 0, 1, 1>
3296static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
3297  int NumElems = VT.getVectorNumElements();
3298  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
3299    return false;
3300
3301  // Handle vector lengths > 128 bits. Define a "section" as a set of
3302  // 128 bits. AVX defines UNPCK* to operate independently on 128-bit
3303  // sections.
3304  unsigned NumSections = VT.getSizeInBits() / 128;
3305  if (NumSections == 0) NumSections = 1;  // Handle MMX
3306  unsigned NumSectionElts = NumElems / NumSections;
3307
3308  for (unsigned s = 0; s < NumSections; ++s) {
3309    for (unsigned i = s * NumSectionElts, j = s * NumSectionElts;
3310         i != NumSectionElts * (s + 1);
3311         i += 2, ++j) {
3312      int BitI  = Mask[i];
3313      int BitI1 = Mask[i+1];
3314
3315      if (!isUndefOrEqual(BitI, j))
3316        return false;
3317      if (!isUndefOrEqual(BitI1, j))
3318        return false;
3319    }
3320  }
3321
3322  return true;
3323}
3324
3325bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
3326  SmallVector<int, 8> M;
3327  N->getMask(M);
3328  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
3329}
3330
3331/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
3332/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
3333/// <2, 2, 3, 3>
3334static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
3335  int NumElems = VT.getVectorNumElements();
3336  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
3337    return false;
3338
3339  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
3340    int BitI  = Mask[i];
3341    int BitI1 = Mask[i+1];
3342    if (!isUndefOrEqual(BitI, j))
3343      return false;
3344    if (!isUndefOrEqual(BitI1, j))
3345      return false;
3346  }
3347  return true;
3348}
3349
3350bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
3351  SmallVector<int, 8> M;
3352  N->getMask(M);
3353  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
3354}
3355
3356/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
3357/// specifies a shuffle of elements that is suitable for input to MOVSS,
3358/// MOVSD, and MOVD, i.e. setting the lowest element.
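/// For example, for v4f32 movss corresponds to the mask <4, 1, 2, 3>:
/// element 0 is taken from V2 and the remaining elements are kept from V1.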
3359static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
3360  if (VT.getVectorElementType().getSizeInBits() < 32)
3361    return false;
3362
3363  int NumElts = VT.getVectorNumElements();
3364
3365  if (!isUndefOrEqual(Mask[0], NumElts))
3366    return false;
3367
3368  for (int i = 1; i < NumElts; ++i)
3369    if (!isUndefOrEqual(Mask[i], i))
3370      return false;
3371
3372  return true;
3373}
3374
3375bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
3376  SmallVector<int, 8> M;
3377  N->getMask(M);
3378  return ::isMOVLMask(M, N->getValueType(0));
3379}
3380
3381/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse
3382/// of what x86 movss wants. X86 movs requires the lowest element to be the lowest
3383/// element of vector 2 and the other elements to come from vector 1 in order.
3384static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
3385                               bool V2IsSplat = false, bool V2IsUndef = false) {
3386  int NumOps = VT.getVectorNumElements();
3387  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
3388    return false;
3389
3390  if (!isUndefOrEqual(Mask[0], 0))
3391    return false;
3392
3393  for (int i = 1; i < NumOps; ++i)
3394    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
3395          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
3396          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
3397      return false;
3398
3399  return true;
3400}
3401
3402static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
3403                           bool V2IsUndef = false) {
3404  SmallVector<int, 8> M;
3405  N->getMask(M);
3406  return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
3407}
3408
3409/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3410/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
3411bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
3412  if (N->getValueType(0).getVectorNumElements() != 4)
3413    return false;
3414
3415  // Expect 1, 1, 3, 3
3416  for (unsigned i = 0; i < 2; ++i) {
3417    int Elt = N->getMaskElt(i);
3418    if (Elt >= 0 && Elt != 1)
3419      return false;
3420  }
3421
3422  bool HasHi = false;
3423  for (unsigned i = 2; i < 4; ++i) {
3424    int Elt = N->getMaskElt(i);
3425    if (Elt >= 0 && Elt != 3)
3426      return false;
3427    if (Elt == 3)
3428      HasHi = true;
3429  }
3430  // Don't use movshdup if it can be done with a shufps.
3431  // FIXME: verify that matching u, u, 3, 3 is what we want.
3432  return HasHi;
3433}
3434
3435/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3436/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
3437bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
3438  if (N->getValueType(0).getVectorNumElements() != 4)
3439    return false;
3440
3441  // Expect 0, 0, 2, 2
3442  for (unsigned i = 0; i < 2; ++i)
3443    if (N->getMaskElt(i) > 0)
3444      return false;
3445
3446  bool HasHi = false;
3447  for (unsigned i = 2; i < 4; ++i) {
3448    int Elt = N->getMaskElt(i);
3449    if (Elt >= 0 && Elt != 2)
3450      return false;
3451    if (Elt == 2)
3452      HasHi = true;
3453  }
3454  // Don't use movsldup if it can be done with a shufps.
3455  return HasHi;
3456}
3457
3458/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3459/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
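/// For example, for v2f64 movddup duplicates the low element, i.e. the
/// mask <0, 0>.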
3460bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3461 int e = N->getValueType(0).getVectorNumElements() / 2; 3462 3463 for (int i = 0; i < e; ++i) 3464 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3465 return false; 3466 for (int i = 0; i < e; ++i) 3467 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3468 return false; 3469 return true; 3470} 3471 3472/// isVEXTRACTF128Index - Return true if the specified 3473/// EXTRACT_SUBVECTOR operand specifies a vector extract that is 3474/// suitable for input to VEXTRACTF128. 3475bool X86::isVEXTRACTF128Index(SDNode *N) { 3476 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3477 return false; 3478 3479 // The index should be aligned on a 128-bit boundary. 3480 uint64_t Index = 3481 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3482 3483 unsigned VL = N->getValueType(0).getVectorNumElements(); 3484 unsigned VBits = N->getValueType(0).getSizeInBits(); 3485 unsigned ElSize = VBits / VL; 3486 bool Result = (Index * ElSize) % 128 == 0; 3487 3488 return Result; 3489} 3490 3491/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR 3492/// operand specifies a subvector insert that is suitable for input to 3493/// VINSERTF128. 3494bool X86::isVINSERTF128Index(SDNode *N) { 3495 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3496 return false; 3497 3498 // The index should be aligned on a 128-bit boundary. 3499 uint64_t Index = 3500 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3501 3502 unsigned VL = N->getValueType(0).getVectorNumElements(); 3503 unsigned VBits = N->getValueType(0).getSizeInBits(); 3504 unsigned ElSize = VBits / VL; 3505 bool Result = (Index * ElSize) % 128 == 0; 3506 3507 return Result; 3508} 3509 3510/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3511/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3512unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3513 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3514 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3515 3516 unsigned Shift = (NumOperands == 4) ? 2 : 1; 3517 unsigned Mask = 0; 3518 for (int i = 0; i < NumOperands; ++i) { 3519 int Val = SVOp->getMaskElt(NumOperands-i-1); 3520 if (Val < 0) Val = 0; 3521 if (Val >= NumOperands) Val -= NumOperands; 3522 Mask |= Val; 3523 if (i != NumOperands - 1) 3524 Mask <<= Shift; 3525 } 3526 return Mask; 3527} 3528 3529/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3530/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3531unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3532 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3533 unsigned Mask = 0; 3534 // 8 nodes, but we only care about the last 4. 3535 for (unsigned i = 7; i >= 4; --i) { 3536 int Val = SVOp->getMaskElt(i); 3537 if (Val >= 0) 3538 Mask |= (Val - 4); 3539 if (i != 4) 3540 Mask <<= 2; 3541 } 3542 return Mask; 3543} 3544 3545/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3546/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3547unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3548 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3549 unsigned Mask = 0; 3550 // 8 nodes, but we only care about the first 4. 
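  // Two bits per element, element 0 in the low bits. For example, the mask
  // <2, 1, 0, 3, u, u, u, u> encodes as (3<<6) | (0<<4) | (1<<2) | 2 = 0xC6.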
3551 for (int i = 3; i >= 0; --i) { 3552 int Val = SVOp->getMaskElt(i); 3553 if (Val >= 0) 3554 Mask |= Val; 3555 if (i != 0) 3556 Mask <<= 2; 3557 } 3558 return Mask; 3559} 3560 3561/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3562/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3563unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3564 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3565 EVT VVT = N->getValueType(0); 3566 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3567 int Val = 0; 3568 3569 unsigned i, e; 3570 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3571 Val = SVOp->getMaskElt(i); 3572 if (Val >= 0) 3573 break; 3574 } 3575 return (Val - i) * EltSize; 3576} 3577 3578/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate 3579/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 3580/// instructions. 3581unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { 3582 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3583 llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); 3584 3585 uint64_t Index = 3586 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3587 3588 EVT VecVT = N->getOperand(0).getValueType(); 3589 EVT ElVT = VecVT.getVectorElementType(); 3590 3591 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 3592 3593 return Index / NumElemsPerChunk; 3594} 3595 3596/// getInsertVINSERTF128Immediate - Return the appropriate immediate 3597/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 3598/// instructions. 3599unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { 3600 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3601 llvm_unreachable("Illegal insert subvector for VINSERTF128"); 3602 3603 uint64_t Index = 3604 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3605 3606 EVT VecVT = N->getValueType(0); 3607 EVT ElVT = VecVT.getVectorElementType(); 3608 3609 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 3610 3611 return Index / NumElemsPerChunk; 3612} 3613 3614/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3615/// constant +0.0. 3616bool X86::isZeroNode(SDValue Elt) { 3617 return ((isa<ConstantSDNode>(Elt) && 3618 cast<ConstantSDNode>(Elt)->isNullValue()) || 3619 (isa<ConstantFPSDNode>(Elt) && 3620 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3621} 3622 3623/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3624/// their permute mask. 3625static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3626 SelectionDAG &DAG) { 3627 EVT VT = SVOp->getValueType(0); 3628 unsigned NumElems = VT.getVectorNumElements(); 3629 SmallVector<int, 8> MaskVec; 3630 3631 for (unsigned i = 0; i != NumElems; ++i) { 3632 int idx = SVOp->getMaskElt(i); 3633 if (idx < 0) 3634 MaskVec.push_back(idx); 3635 else if (idx < (int)NumElems) 3636 MaskVec.push_back(idx + NumElems); 3637 else 3638 MaskVec.push_back(idx - NumElems); 3639 } 3640 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3641 SVOp->getOperand(0), &MaskVec[0]); 3642} 3643 3644/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3645/// the two vector operands have swapped position. 
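/// For example, for v4i32 the mask <0, 5, 2, 7> becomes <4, 1, 6, 3>.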
3646static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
3647  unsigned NumElems = VT.getVectorNumElements();
3648  for (unsigned i = 0; i != NumElems; ++i) {
3649    int idx = Mask[i];
3650    if (idx < 0)
3651      continue;
3652    else if (idx < (int)NumElems)
3653      Mask[i] = idx + NumElems;
3654    else
3655      Mask[i] = idx - NumElems;
3656  }
3657}
3658
3659/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
3660/// match movhlps. The lower half elements should come from the upper half of
3661/// V1 (and in order), and the upper half elements should come from the upper
3662/// half of V2 (and in order).
3663static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
3664  if (Op->getValueType(0).getVectorNumElements() != 4)
3665    return false;
3666  for (unsigned i = 0, e = 2; i != e; ++i)
3667    if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
3668      return false;
3669  for (unsigned i = 2; i != 4; ++i)
3670    if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
3671      return false;
3672  return true;
3673}
3674
3675/// isScalarLoadToVector - Returns true if the node is a scalar load that
3676/// is promoted to a vector. It also returns the LoadSDNode by reference if
3677/// required.
3678static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
3679  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
3680    return false;
3681  N = N->getOperand(0).getNode();
3682  if (!ISD::isNON_EXTLoad(N))
3683    return false;
3684  if (LD)
3685    *LD = cast<LoadSDNode>(N);
3686  return true;
3687}
3688
3689/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
3690/// match movlp{s|d}. The lower half elements should come from the lower half of
3691/// V1 (and in order), and the upper half elements should come from the upper
3692/// half of V2 (and in order). And since V1 will become the source of the
3693/// MOVLP, it must be either a vector load or a scalar load to vector.
3694static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
3695                               ShuffleVectorSDNode *Op) {
3696  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
3697    return false;
3698  // If V2 is a vector load, don't do this transformation. We will try to use
3699  // a load-folding shufps op instead.
3700  if (ISD::isNON_EXTLoad(V2))
3701    return false;
3702
3703  unsigned NumElems = Op->getValueType(0).getVectorNumElements();
3704
3705  if (NumElems != 2 && NumElems != 4)
3706    return false;
3707  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3708    if (!isUndefOrEqual(Op->getMaskElt(i), i))
3709      return false;
3710  for (unsigned i = NumElems/2; i != NumElems; ++i)
3711    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
3712      return false;
3713  return true;
3714}
3715
3716/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
3717/// all the same.
3718static bool isSplatVector(SDNode *N) {
3719  if (N->getOpcode() != ISD::BUILD_VECTOR)
3720    return false;
3721
3722  SDValue SplatValue = N->getOperand(0);
3723  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
3724    if (N->getOperand(i) != SplatValue)
3725      return false;
3726  return true;
3727}
3728
3729/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
3730/// to a zero vector.
3731/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
3732static bool isZeroShuffle(ShuffleVectorSDNode *N) {
3733  SDValue V1 = N->getOperand(0);
3734  SDValue V2 = N->getOperand(1);
3735  unsigned NumElems = N->getValueType(0).getVectorNumElements();
3736  for (unsigned i = 0; i != NumElems; ++i) {
3737    int Idx = N->getMaskElt(i);
3738    if (Idx >= (int)NumElems) {
3739      unsigned Opc = V2.getOpcode();
3740      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
3741        continue;
3742      if (Opc != ISD::BUILD_VECTOR ||
3743          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
3744        return false;
3745    } else if (Idx >= 0) {
3746      unsigned Opc = V1.getOpcode();
3747      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
3748        continue;
3749      if (Opc != ISD::BUILD_VECTOR ||
3750          !X86::isZeroNode(V1.getOperand(Idx)))
3751        return false;
3752    }
3753  }
3754  return true;
3755}
3756
3757/// getZeroVector - Returns a vector of specified type with all zero elements.
3758///
3759static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
3760                             DebugLoc dl) {
3761  assert(VT.isVector() && "Expected a vector type");
3762
3763  // Always build SSE zero vectors as <4 x i32> bitcasted
3764  // to their dest type. This ensures they get CSE'd.
3765  SDValue Vec;
3766  if (VT.getSizeInBits() == 128) {  // SSE
3767    if (HasSSE2) {  // SSE2
3768      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
3769      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3770    } else { // SSE1
3771      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
3772      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
3773    }
3774  } else if (VT.getSizeInBits() == 256) { // AVX
3775    // 256-bit logic and arithmetic instructions in AVX are all
3776    // floating-point, with no support for integer ops, so default
3777    // to emitting fp zeroed vectors.
3778    SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
3779    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
3780    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
3781  }
3782  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
3783}
3784
3785/// getOnesVector - Returns a vector of specified type with all bits set.
3786///
3787static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
3788  assert(VT.isVector() && "Expected a vector type");
3789
3790  // Always build ones vectors as <4 x i32> bitcasted to their dest
3791  // type. This ensures they get CSE'd.
3792  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
3793  SDValue Vec;
3794  Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3795  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
3796}
3797
3798
3799/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
3800/// that point to V2 point to its first element.
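/// For example, for v4i32 with a splatted V2, <0, 6, 2, 7> is normalized to
/// <0, 4, 2, 4>, so every reference to V2 reads its first element.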
3801static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
3802  EVT VT = SVOp->getValueType(0);
3803  unsigned NumElems = VT.getVectorNumElements();
3804
3805  bool Changed = false;
3806  SmallVector<int, 8> MaskVec;
3807  SVOp->getMask(MaskVec);
3808
3809  for (unsigned i = 0; i != NumElems; ++i) {
3810    if (MaskVec[i] > (int)NumElems) {
3811      MaskVec[i] = NumElems;
3812      Changed = true;
3813    }
3814  }
3815  if (Changed)
3816    return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
3817                                SVOp->getOperand(1), &MaskVec[0]);
3818  return SDValue(SVOp, 0);
3819}
3820
3821/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
3822/// operation of specified width.
3823static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3824                       SDValue V2) {
3825  unsigned NumElems = VT.getVectorNumElements();
3826  SmallVector<int, 8> Mask;
3827  Mask.push_back(NumElems);
3828  for (unsigned i = 1; i != NumElems; ++i)
3829    Mask.push_back(i);
3830  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3831}
3832
3833/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
3834static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3835                          SDValue V2) {
3836  unsigned NumElems = VT.getVectorNumElements();
3837  SmallVector<int, 8> Mask;
3838  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
3839    Mask.push_back(i);
3840    Mask.push_back(i + NumElems);
3841  }
3842  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3843}
3844
3845/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
3846static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3847                          SDValue V2) {
3848  unsigned NumElems = VT.getVectorNumElements();
3849  unsigned Half = NumElems/2;
3850  SmallVector<int, 8> Mask;
3851  for (unsigned i = 0; i != Half; ++i) {
3852    Mask.push_back(i + Half);
3853    Mask.push_back(i + NumElems + Half);
3854  }
3855  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3856}
3857
3858/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32.
3859static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
3860  EVT PVT = MVT::v4f32;
3861  EVT VT = SV->getValueType(0);
3862  DebugLoc dl = SV->getDebugLoc();
3863  SDValue V1 = SV->getOperand(0);
3864  int NumElems = VT.getVectorNumElements();
3865  int EltNo = SV->getSplatIndex();
3866
3867  // Unpack the elements to the correct location.
3868  while (NumElems > 4) {
3869    if (EltNo < NumElems/2) {
3870      V1 = getUnpackl(DAG, dl, VT, V1, V1);
3871    } else {
3872      V1 = getUnpackh(DAG, dl, VT, V1, V1);
3873      EltNo -= NumElems/2;
3874    }
3875    NumElems >>= 1;
3876  }
3877
3878  // Perform the splat.
3879  int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
3880  V1 = DAG.getNode(ISD::BITCAST, dl, PVT, V1);
3881  V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
3882  return DAG.getNode(ISD::BITCAST, dl, VT, V1);
3883}
3884
3885/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
3886/// vector and a zero or undef vector. This produces a shuffle where the low
3887/// element of V2 is swizzled into the zero/undef vector, landing at element
3888/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
3889static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
3890                                           bool isZero, bool HasSSE2,
3891                                           SelectionDAG &DAG) {
3892  EVT VT = V2.getValueType();
3893  SDValue V1 = isZero
3894    ?
getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3895 unsigned NumElems = VT.getVectorNumElements(); 3896 SmallVector<int, 16> MaskVec; 3897 for (unsigned i = 0; i != NumElems; ++i) 3898 // If this is the insertion idx, put the low elt of V2 here. 3899 MaskVec.push_back(i == Idx ? NumElems : i); 3900 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3901} 3902 3903/// getShuffleScalarElt - Returns the scalar element that will make up the ith 3904/// element of the result of the vector shuffle. 3905static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, 3906 unsigned Depth) { 3907 if (Depth == 6) 3908 return SDValue(); // Limit search depth. 3909 3910 SDValue V = SDValue(N, 0); 3911 EVT VT = V.getValueType(); 3912 unsigned Opcode = V.getOpcode(); 3913 3914 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 3915 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 3916 Index = SV->getMaskElt(Index); 3917 3918 if (Index < 0) 3919 return DAG.getUNDEF(VT.getVectorElementType()); 3920 3921 int NumElems = VT.getVectorNumElements(); 3922 SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); 3923 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); 3924 } 3925 3926 // Recurse into target specific vector shuffles to find scalars. 3927 if (isTargetShuffle(Opcode)) { 3928 int NumElems = VT.getVectorNumElements(); 3929 SmallVector<unsigned, 16> ShuffleMask; 3930 SDValue ImmN; 3931 3932 switch(Opcode) { 3933 case X86ISD::SHUFPS: 3934 case X86ISD::SHUFPD: 3935 ImmN = N->getOperand(N->getNumOperands()-1); 3936 DecodeSHUFPSMask(NumElems, 3937 cast<ConstantSDNode>(ImmN)->getZExtValue(), 3938 ShuffleMask); 3939 break; 3940 case X86ISD::PUNPCKHBW: 3941 case X86ISD::PUNPCKHWD: 3942 case X86ISD::PUNPCKHDQ: 3943 case X86ISD::PUNPCKHQDQ: 3944 DecodePUNPCKHMask(NumElems, ShuffleMask); 3945 break; 3946 case X86ISD::UNPCKHPS: 3947 case X86ISD::UNPCKHPD: 3948 DecodeUNPCKHPMask(NumElems, ShuffleMask); 3949 break; 3950 case X86ISD::PUNPCKLBW: 3951 case X86ISD::PUNPCKLWD: 3952 case X86ISD::PUNPCKLDQ: 3953 case X86ISD::PUNPCKLQDQ: 3954 DecodePUNPCKLMask(VT, ShuffleMask); 3955 break; 3956 case X86ISD::UNPCKLPS: 3957 case X86ISD::UNPCKLPD: 3958 case X86ISD::VUNPCKLPS: 3959 case X86ISD::VUNPCKLPD: 3960 case X86ISD::VUNPCKLPSY: 3961 case X86ISD::VUNPCKLPDY: 3962 DecodeUNPCKLPMask(VT, ShuffleMask); 3963 break; 3964 case X86ISD::MOVHLPS: 3965 DecodeMOVHLPSMask(NumElems, ShuffleMask); 3966 break; 3967 case X86ISD::MOVLHPS: 3968 DecodeMOVLHPSMask(NumElems, ShuffleMask); 3969 break; 3970 case X86ISD::PSHUFD: 3971 ImmN = N->getOperand(N->getNumOperands()-1); 3972 DecodePSHUFMask(NumElems, 3973 cast<ConstantSDNode>(ImmN)->getZExtValue(), 3974 ShuffleMask); 3975 break; 3976 case X86ISD::PSHUFHW: 3977 ImmN = N->getOperand(N->getNumOperands()-1); 3978 DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 3979 ShuffleMask); 3980 break; 3981 case X86ISD::PSHUFLW: 3982 ImmN = N->getOperand(N->getNumOperands()-1); 3983 DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 3984 ShuffleMask); 3985 break; 3986 case X86ISD::MOVSS: 3987 case X86ISD::MOVSD: { 3988 // The index 0 always comes from the first element of the second source, 3989 // this is why MOVSS and MOVSD are used in the first place. The other 3990 // elements come from the other positions of the first source vector. 3991 unsigned OpNum = (Index == 0) ? 
1 : 0;
3992    return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
3993                               Depth+1);
3994  }
3995  default:
3996    assert(0 && "not implemented for target shuffle node");
3997    return SDValue();
3998  }
3999
4000  Index = ShuffleMask[Index];
4001  if (Index < 0)
4002    return DAG.getUNDEF(VT.getVectorElementType());
4003
4004  SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
4005  return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
4006                             Depth+1);
4007  }
4008
4009  // Actual nodes that may contain scalar elements.
4010  if (Opcode == ISD::BITCAST) {
4011    V = V.getOperand(0);
4012    EVT SrcVT = V.getValueType();
4013    unsigned NumElems = VT.getVectorNumElements();
4014
4015    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
4016      return SDValue();
4017  }
4018
4019  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
4020    return (Index == 0) ? V.getOperand(0)
4021                        : DAG.getUNDEF(VT.getVectorElementType());
4022
4023  if (V.getOpcode() == ISD::BUILD_VECTOR)
4024    return V.getOperand(Index);
4025
4026  return SDValue();
4027}
4028
4029/// getNumOfConsecutiveZeros - Return the number of elements of a vector
4030/// shuffle operation which come consecutively from zero. The search can
4031/// start from either direction, left or right.
4032static
4033unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
4034                                  bool ZerosFromLeft, SelectionDAG &DAG) {
4035  int i = 0;
4036
4037  while (i < NumElems) {
4038    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
4039    SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0);
4040    if (!(Elt.getNode() &&
4041          (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
4042      break;
4043    ++i;
4044  }
4045
4046  return i;
4047}
4048
4049/// isShuffleMaskConsecutive - Check if the shuffle mask indices from MaskI to
4050/// MaskE correspond consecutively to elements from one of the vector operands,
4051/// starting from its index OpIdx. Also tell OpNum which source vector operand.
4052static
4053bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
4054                              int OpIdx, int NumElems, unsigned &OpNum) {
4055  bool SeenV1 = false;
4056  bool SeenV2 = false;
4057
4058  for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
4059    int Idx = SVOp->getMaskElt(i);
4060    // Ignore undef indices.
4061    if (Idx < 0)
4062      continue;
4063
4064    if (Idx < NumElems)
4065      SeenV1 = true;
4066    else
4067      SeenV2 = true;
4068
4069    // Only accept consecutive elements from the same vector.
4070    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
4071      return false;
4072  }
4073
4074  OpNum = SeenV1 ? 0 : 1;
4075  return true;
4076}
4077
4078/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
4079/// logical right shift of a vector.
4080static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4081                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4082  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
4083  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
4084                                               false /* check zeros from right */, DAG);
4085  unsigned OpSrc;
4086
4087  if (!NumZeros)
4088    return false;
4089
4090  // Considering the elements in the mask that are not consecutive zeros,
4091  // check if they consecutively come from only one of the source vectors.
4092 // 4093 // V1 = {X, A, B, C} 0 4094 // \ \ \ / 4095 // vector_shuffle V1, V2 <1, 2, 3, X> 4096 // 4097 if (!isShuffleMaskConsecutive(SVOp, 4098 0, // Mask Start Index 4099 NumElems-NumZeros-1, // Mask End Index 4100 NumZeros, // Where to start looking in the src vector 4101 NumElems, // Number of elements in vector 4102 OpSrc)) // Which source operand ? 4103 return false; 4104 4105 isLeft = false; 4106 ShAmt = NumZeros; 4107 ShVal = SVOp->getOperand(OpSrc); 4108 return true; 4109} 4110 4111/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a 4112/// logical left shift of a vector. 4113static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4114 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4115 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 4116 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 4117 true /* check zeros from left */, DAG); 4118 unsigned OpSrc; 4119 4120 if (!NumZeros) 4121 return false; 4122 4123 // Considering the elements in the mask that are not consecutive zeros, 4124 // check if they consecutively come from only one of the source vectors. 4125 // 4126 // 0 { A, B, X, X } = V2 4127 // / \ / / 4128 // vector_shuffle V1, V2 <X, X, 4, 5> 4129 // 4130 if (!isShuffleMaskConsecutive(SVOp, 4131 NumZeros, // Mask Start Index 4132 NumElems-1, // Mask End Index 4133 0, // Where to start looking in the src vector 4134 NumElems, // Number of elements in vector 4135 OpSrc)) // Which source operand ? 4136 return false; 4137 4138 isLeft = true; 4139 ShAmt = NumZeros; 4140 ShVal = SVOp->getOperand(OpSrc); 4141 return true; 4142} 4143 4144/// isVectorShift - Returns true if the shuffle can be implemented as a 4145/// logical left or right shift of a vector. 4146static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4147 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4148 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 4149 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 4150 return true; 4151 4152 return false; 4153} 4154 4155/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
4156/// 4157static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 4158 unsigned NumNonZero, unsigned NumZero, 4159 SelectionDAG &DAG, 4160 const TargetLowering &TLI) { 4161 if (NumNonZero > 8) 4162 return SDValue(); 4163 4164 DebugLoc dl = Op.getDebugLoc(); 4165 SDValue V(0, 0); 4166 bool First = true; 4167 for (unsigned i = 0; i < 16; ++i) { 4168 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 4169 if (ThisIsNonZero && First) { 4170 if (NumZero) 4171 V = getZeroVector(MVT::v8i16, true, DAG, dl); 4172 else 4173 V = DAG.getUNDEF(MVT::v8i16); 4174 First = false; 4175 } 4176 4177 if ((i & 1) != 0) { 4178 SDValue ThisElt(0, 0), LastElt(0, 0); 4179 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 4180 if (LastIsNonZero) { 4181 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 4182 MVT::i16, Op.getOperand(i-1)); 4183 } 4184 if (ThisIsNonZero) { 4185 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 4186 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 4187 ThisElt, DAG.getConstant(8, MVT::i8)); 4188 if (LastIsNonZero) 4189 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 4190 } else 4191 ThisElt = LastElt; 4192 4193 if (ThisElt.getNode()) 4194 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 4195 DAG.getIntPtrConstant(i/2)); 4196 } 4197 } 4198 4199 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 4200} 4201 4202/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 4203/// 4204static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 4205 unsigned NumNonZero, unsigned NumZero, 4206 SelectionDAG &DAG, 4207 const TargetLowering &TLI) { 4208 if (NumNonZero > 4) 4209 return SDValue(); 4210 4211 DebugLoc dl = Op.getDebugLoc(); 4212 SDValue V(0, 0); 4213 bool First = true; 4214 for (unsigned i = 0; i < 8; ++i) { 4215 bool isNonZero = (NonZeros & (1 << i)) != 0; 4216 if (isNonZero) { 4217 if (First) { 4218 if (NumZero) 4219 V = getZeroVector(MVT::v8i16, true, DAG, dl); 4220 else 4221 V = DAG.getUNDEF(MVT::v8i16); 4222 First = false; 4223 } 4224 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 4225 MVT::v8i16, V, Op.getOperand(i), 4226 DAG.getIntPtrConstant(i)); 4227 } 4228 } 4229 4230 return V; 4231} 4232 4233/// getVShift - Return a vector logical shift node. 4234/// 4235static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 4236 unsigned NumBits, SelectionDAG &DAG, 4237 const TargetLowering &TLI, DebugLoc dl) { 4238 EVT ShVT = MVT::v2i64; 4239 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 4240 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 4241 return DAG.getNode(ISD::BITCAST, dl, VT, 4242 DAG.getNode(Opc, dl, ShVT, SrcOp, 4243 DAG.getConstant(NumBits, 4244 TLI.getShiftAmountTy(SrcOp.getValueType())))); 4245} 4246 4247SDValue 4248X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 4249 SelectionDAG &DAG) const { 4250 4251 // Check if the scalar load can be widened into a vector load. And if 4252 // the address is "base + cst" see if the cst can be "absorbed" into 4253 // the shuffle mask. 
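  // For example, a 4-byte load from frame-slot offset 8 can, once the slot is
  // known to be 16-byte aligned, be widened to a <4 x i32> load of the whole
  // slot followed by a splat of element (8 - 0) / 4 == 2.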
4254  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
4255    SDValue Ptr = LD->getBasePtr();
4256    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
4257      return SDValue();
4258    EVT PVT = LD->getValueType(0);
4259    if (PVT != MVT::i32 && PVT != MVT::f32)
4260      return SDValue();
4261
4262    int FI = -1;
4263    int64_t Offset = 0;
4264    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
4265      FI = FINode->getIndex();
4266      Offset = 0;
4267    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
4268               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
4269      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
4270      Offset = Ptr.getConstantOperandVal(1);
4271      Ptr = Ptr.getOperand(0);
4272    } else {
4273      return SDValue();
4274    }
4275
4276    SDValue Chain = LD->getChain();
4277    // Make sure the stack object alignment is at least 16.
4278    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
4279    if (DAG.InferPtrAlignment(Ptr) < 16) {
4280      if (MFI->isFixedObjectIndex(FI)) {
4281        // Can't change the alignment. FIXME: It's possible to compute
4282        // the exact stack offset and reference FI + adjust offset instead,
4283        // if someone *really* cares about this; that's the way to implement it.
4284        return SDValue();
4285      } else {
4286        MFI->setObjectAlignment(FI, 16);
4287      }
4288    }
4289
4290    // (Offset % 16) must be a multiple of 4. The address is then
4291    // Ptr + (Offset & ~15).
4292    if (Offset < 0)
4293      return SDValue();
4294    if ((Offset % 16) & 3)
4295      return SDValue();
4296    int64_t StartOffset = Offset & ~15;
4297    if (StartOffset)
4298      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
4299                        Ptr, DAG.getConstant(StartOffset, Ptr.getValueType()));
4300
4301    int EltNo = (Offset - StartOffset) >> 2;
4302    int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
4303    EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
4304    SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,
4305                             LD->getPointerInfo().getWithOffset(StartOffset),
4306                             false, false, 0);
4307    // Canonicalize it to a v4i32 shuffle.
4308    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
4309    return DAG.getNode(ISD::BITCAST, dl, VT,
4310                       DAG.getVectorShuffle(MVT::v4i32, dl, V1,
4311                                            DAG.getUNDEF(MVT::v4i32), &Mask[0]));
4312  }
4313
4314  return SDValue();
4315}
4316
4317/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
4318/// vector of type 'VT', see if the elements can be replaced by a single large
4319/// load which has the same value as a build_vector whose operands are 'elts'.
4320///
4321/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
4322///
4323/// FIXME: we'd also like to handle the case where the last elements are zero
4324/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
4325/// There's even a handy isZeroNode for that purpose.
4326static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
4327                                        DebugLoc &DL, SelectionDAG &DAG) {
4328  EVT EltVT = VT.getVectorElementType();
4329  unsigned NumElems = Elts.size();
4330
4331  LoadSDNode *LDBase = NULL;
4332  unsigned LastLoadedElt = -1U;
4333
4334  // For each element in the initializer, see if we've found a load or an undef.
4335  // If we don't find an initial load element, or later load elements are
4336  // non-consecutive, bail out.
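  // Here "consecutive" means that element i reads i * (element size in bytes)
  // past the base load LDBase.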
4337  for (unsigned i = 0; i < NumElems; ++i) {
4338    SDValue Elt = Elts[i];
4339
4340    if (!Elt.getNode() ||
4341        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
4342      return SDValue();
4343    if (!LDBase) {
4344      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
4345        return SDValue();
4346      LDBase = cast<LoadSDNode>(Elt.getNode());
4347      LastLoadedElt = i;
4348      continue;
4349    }
4350    if (Elt.getOpcode() == ISD::UNDEF)
4351      continue;
4352
4353    LoadSDNode *LD = cast<LoadSDNode>(Elt);
4354    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
4355      return SDValue();
4356    LastLoadedElt = i;
4357  }
4358
4359  // If we have found an entire vector of loads and undefs, then return a large
4360  // load of the entire vector width starting at the base pointer. If we found
4361  // consecutive loads for the low half, generate a vzext_load node.
4362  if (LastLoadedElt == NumElems - 1) {
4363    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
4364      return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
4365                         LDBase->getPointerInfo(),
4366                         LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
4367    return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
4368                       LDBase->getPointerInfo(),
4369                       LDBase->isVolatile(), LDBase->isNonTemporal(),
4370                       LDBase->getAlignment());
4371  } else if (NumElems == 4 && LastLoadedElt == 1) {
4372    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
4373    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
4374    SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys,
4375                                              Ops, 2, MVT::i32,
4376                                              LDBase->getMemOperand());
4377    return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
4378  }
4379  return SDValue();
4380}
4381
4382SDValue
4383X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
4384  DebugLoc dl = Op.getDebugLoc();
4385
4386  EVT VT = Op.getValueType();
4387  EVT ExtVT = VT.getVectorElementType();
4388
4389  unsigned NumElems = Op.getNumOperands();
4390
4391  // For AVX-length vectors, build the individual 128-bit pieces and
4392  // use shuffles to put them in place.
4393  if (VT.getSizeInBits() > 256 &&
4394      Subtarget->hasAVX() &&
4395      !ISD::isBuildVectorAllZeros(Op.getNode())) {
4396    SmallVector<SDValue, 8> V;
4397    V.resize(NumElems);
4398    for (unsigned i = 0; i < NumElems; ++i) {
4399      V[i] = Op.getOperand(i);
4400    }
4401
4402    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
4403
4404    // Build the lower subvector.
4405    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
4406    // Build the upper subvector.
4407    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
4408                                NumElems/2);
4409
4410    return ConcatVectors(Lower, Upper, DAG);
4411  }
4412
4413  // All zeros are handled with pxor in SSE2 and above, xorps in SSE1.
4414  // All ones are handled with pcmpeqd. In AVX, zeros are handled with
4415  // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256-bit version of
4416  // pcmpeqd is present, so AllOnes is ignored.
4417  if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
4418      (Op.getValueType().getSizeInBits() != 256 &&
4419       ISD::isBuildVectorAllOnes(Op.getNode()))) {
4420    // Canonicalize this to <4 x i32> (SSE) to
4421    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
4422    // eliminated on x86-32 hosts.
4423 if (Op.getValueType() == MVT::v4i32) 4424 return Op; 4425 4426 if (ISD::isBuildVectorAllOnes(Op.getNode())) 4427 return getOnesVector(Op.getValueType(), DAG, dl); 4428 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 4429 } 4430 4431 unsigned EVTBits = ExtVT.getSizeInBits(); 4432 4433 unsigned NumZero = 0; 4434 unsigned NumNonZero = 0; 4435 unsigned NonZeros = 0; 4436 bool IsAllConstants = true; 4437 SmallSet<SDValue, 8> Values; 4438 for (unsigned i = 0; i < NumElems; ++i) { 4439 SDValue Elt = Op.getOperand(i); 4440 if (Elt.getOpcode() == ISD::UNDEF) 4441 continue; 4442 Values.insert(Elt); 4443 if (Elt.getOpcode() != ISD::Constant && 4444 Elt.getOpcode() != ISD::ConstantFP) 4445 IsAllConstants = false; 4446 if (X86::isZeroNode(Elt)) 4447 NumZero++; 4448 else { 4449 NonZeros |= (1 << i); 4450 NumNonZero++; 4451 } 4452 } 4453 4454 // All undef vector. Return an UNDEF. All zero vectors were handled above. 4455 if (NumNonZero == 0) 4456 return DAG.getUNDEF(VT); 4457 4458 // Special case for single non-zero, non-undef, element. 4459 if (NumNonZero == 1) { 4460 unsigned Idx = CountTrailingZeros_32(NonZeros); 4461 SDValue Item = Op.getOperand(Idx); 4462 4463 // If this is an insertion of an i64 value on x86-32, and if the top bits of 4464 // the value are obviously zero, truncate the value to i32 and do the 4465 // insertion that way. Only do this if the value is non-constant or if the 4466 // value is a constant being inserted into element 0. It is cheaper to do 4467 // a constant pool load than it is to do a movd + shuffle. 4468 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 4469 (!IsAllConstants || Idx == 0)) { 4470 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 4471 // Handle SSE only. 4472 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 4473 EVT VecVT = MVT::v4i32; 4474 unsigned VecElts = 4; 4475 4476 // Truncate the value (which may itself be a constant) to i32, and 4477 // convert it to a vector with movd (S2V+shuffle to zero extend). 4478 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 4479 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 4480 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4481 Subtarget->hasSSE2(), DAG); 4482 4483 // Now we have our 32-bit value zero extended in the low element of 4484 // a vector. If Idx != 0, swizzle it into place. 4485 if (Idx != 0) { 4486 SmallVector<int, 4> Mask; 4487 Mask.push_back(Idx); 4488 for (unsigned i = 1; i != VecElts; ++i) 4489 Mask.push_back(i); 4490 Item = DAG.getVectorShuffle(VecVT, dl, Item, 4491 DAG.getUNDEF(Item.getValueType()), 4492 &Mask[0]); 4493 } 4494 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); 4495 } 4496 } 4497 4498 // If we have a constant or non-constant insertion into the low element of 4499 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 4500 // the rest of the elements. This will be matched as movd/movq/movss/movsd 4501 // depending on what the source datatype is. 4502 if (Idx == 0) { 4503 if (NumZero == 0) { 4504 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4505 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 4506 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 4507 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4508 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 
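      // For v4f32 this builds the shuffle <4, 1, 2, 3> of (zero, Item),
      // which is matched as a single movss that merges the scalar into a
      // zeroed register.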
4509 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 4510 DAG); 4511 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 4512 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 4513 assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); 4514 EVT MiddleVT = MVT::v4i32; 4515 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 4516 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4517 Subtarget->hasSSE2(), DAG); 4518 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 4519 } 4520 } 4521 4522 // Is it a vector logical left shift? 4523 if (NumElems == 2 && Idx == 1 && 4524 X86::isZeroNode(Op.getOperand(0)) && 4525 !X86::isZeroNode(Op.getOperand(1))) { 4526 unsigned NumBits = VT.getSizeInBits(); 4527 return getVShift(true, VT, 4528 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4529 VT, Op.getOperand(1)), 4530 NumBits/2, DAG, *this, dl); 4531 } 4532 4533 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 4534 return SDValue(); 4535 4536 // Otherwise, if this is a vector with i32 or f32 elements, and the element 4537 // is a non-constant being inserted into an element other than the low one, 4538 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 4539 // movd/movss) to move this into the low element, then shuffle it into 4540 // place. 4541 if (EVTBits == 32) { 4542 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4543 4544 // Turn it into a shuffle of zero and zero-extended scalar to vector. 4545 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 4546 Subtarget->hasSSE2(), DAG); 4547 SmallVector<int, 8> MaskVec; 4548 for (unsigned i = 0; i < NumElems; i++) 4549 MaskVec.push_back(i == Idx ? 0 : 1); 4550 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 4551 } 4552 } 4553 4554 // Splat is obviously ok. Let legalizer expand it to a shuffle. 4555 if (Values.size() == 1) { 4556 if (EVTBits == 32) { 4557 // Instead of a shuffle like this: 4558 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 4559 // check whether it's possible to issue this instead: 4560 // shuffle (vload ptr), undef, <1, 1, 1, 1> 4561 unsigned Idx = CountTrailingZeros_32(NonZeros); 4562 SDValue Item = Op.getOperand(Idx); 4563 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 4564 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 4565 } 4566 return SDValue(); 4567 } 4568 4569 // A vector full of immediates; various special cases are already 4570 // handled, so this is best done with a single constant-pool load. 4571 if (IsAllConstants) 4572 return SDValue(); 4573 4574 // Let legalizer expand 2-wide build_vectors. 4575 if (EVTBits == 64) { 4576 if (NumNonZero == 1) { 4577 // One half is zero or undef. 4578 unsigned Idx = CountTrailingZeros_32(NonZeros); 4579 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 4580 Op.getOperand(Idx)); 4581 return getShuffleVectorZeroOrUndef(V2, Idx, true, 4582 Subtarget->hasSSE2(), DAG); 4583 } 4584 return SDValue(); 4585 } 4586 4587 // If element VT is < 32 bits, convert it to inserts into a zero vector. 4588 if (EVTBits == 8 && NumElems == 16) { 4589 SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG, 4590 *this); 4591 if (V.getNode()) return V; 4592 } 4593 4594 if (EVTBits == 16 && NumElems == 8) { 4595 SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG, 4596 *this); 4597 if (V.getNode()) return V; 4598 } 4599 4600 // If element VT is == 32 bits, turn it into a number of shuffles.
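// Worked example (added note, not from the original source): for
// (v4f32 build_vector a, 0, b, 0) the code below produces
//   V[0] = scalar_to_vector a   V[1] = zero
//   V[2] = scalar_to_vector b   V[3] = zero
// then combines each pair with a movl/unpckl and finishes with a single
// four-element shuffle of the two halves.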
4601 SmallVector<SDValue, 8> V; 4602 V.resize(NumElems); 4603 if (NumElems == 4 && NumZero > 0) { 4604 for (unsigned i = 0; i < 4; ++i) { 4605 bool isZero = !(NonZeros & (1 << i)); 4606 if (isZero) 4607 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4608 else 4609 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4610 } 4611 4612 for (unsigned i = 0; i < 2; ++i) { 4613 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 4614 default: break; 4615 case 0: 4616 V[i] = V[i*2]; // Must be a zero vector. 4617 break; 4618 case 1: 4619 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 4620 break; 4621 case 2: 4622 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 4623 break; 4624 case 3: 4625 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 4626 break; 4627 } 4628 } 4629 4630 SmallVector<int, 8> MaskVec; 4631 bool Reverse = (NonZeros & 0x3) == 2; 4632 for (unsigned i = 0; i < 2; ++i) 4633 MaskVec.push_back(Reverse ? 1-i : i); 4634 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 4635 for (unsigned i = 0; i < 2; ++i) 4636 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 4637 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 4638 } 4639 4640 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 4641 // Check for a build vector of consecutive loads. 4642 for (unsigned i = 0; i < NumElems; ++i) 4643 V[i] = Op.getOperand(i); 4644 4645 // Check for elements which are consecutive loads. 4646 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 4647 if (LD.getNode()) 4648 return LD; 4649 4650 // For SSE 4.1, use insertps to put the high elements into the low element. 4651 if (getSubtarget()->hasSSE41()) { 4652 SDValue Result; 4653 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 4654 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 4655 else 4656 Result = DAG.getUNDEF(VT); 4657 4658 for (unsigned i = 1; i < NumElems; ++i) { 4659 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 4660 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 4661 Op.getOperand(i), DAG.getIntPtrConstant(i)); 4662 } 4663 return Result; 4664 } 4665 4666 // Otherwise, expand into a number of unpckl*, start by extending each of 4667 // our (non-undef) elements to the full vector width with the element in the 4668 // bottom slot of the vector (which generates no code for SSE). 4669 for (unsigned i = 0; i < NumElems; ++i) { 4670 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 4671 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4672 else 4673 V[i] = DAG.getUNDEF(VT); 4674 } 4675 4676 // Next, we iteratively mix elements, e.g. for v4f32: 4677 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 4678 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 4679 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 4680 unsigned EltStride = NumElems >> 1; 4681 while (EltStride != 0) { 4682 for (unsigned i = 0; i < EltStride; ++i) { 4683 // If V[i+EltStride] is undef and this is the first round of mixing, 4684 // then it is safe to just drop this shuffle: V[i] is already in the 4685 // right place, the one element (since it's the first round) being 4686 // inserted as undef can be dropped. This isn't safe for successive 4687 // rounds because they will permute elements within both vectors. 
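// Illustrative case (added note): building <a, b, undef, undef> with
// NumElems == 4: in the first round (EltStride == 2) both unpacks would only
// merge an undef half and are skipped; the second round's single
// unpckl(V[0], V[1]) already yields <a, b, ...> in the low elements.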
4688 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 4689 EltStride == NumElems/2) 4690 continue; 4691 4692 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 4693 } 4694 EltStride >>= 1; 4695 } 4696 return V[0]; 4697 } 4698 return SDValue(); 4699} 4700 4701SDValue 4702X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 4703 // We support concatenating two MMX registers and placing them in an MMX 4704 // register. This is better than doing the conversion through the stack. 4705 DebugLoc dl = Op.getDebugLoc(); 4706 EVT ResVT = Op.getValueType(); 4707 assert(Op.getNumOperands() == 2); 4708 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 4709 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 4710 int Mask[2]; 4711 SDValue InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op.getOperand(0)); 4712 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4713 InVec = Op.getOperand(1); 4714 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 4715 unsigned NumElts = ResVT.getVectorNumElements(); 4716 VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 4717 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 4718 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 4719 } else { 4720 InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec); 4721 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4722 Mask[0] = 0; Mask[1] = 2; 4723 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 4724 } 4725 return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 4726} 4727 4728// v8i16 shuffles - Prefer shuffles in the following order: 4729// 1. [all] pshuflw, pshufhw, optional move 4730// 2. [ssse3] 1 x pshufb 4731// 3. [ssse3] 2 x pshufb + 1 x por 4732// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 4733SDValue 4734X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 4735 SelectionDAG &DAG) const { 4736 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4737 SDValue V1 = SVOp->getOperand(0); 4738 SDValue V2 = SVOp->getOperand(1); 4739 DebugLoc dl = SVOp->getDebugLoc(); 4740 SmallVector<int, 8> MaskVals; 4741 4742 // Determine if more than 1 of the words in each of the low and high quadwords 4743 // of the result come from the same quadword of one of the two inputs. Undef 4744 // mask values count as coming from any quadword, for better codegen. 4745 SmallVector<unsigned, 4> LoQuad(4); 4746 SmallVector<unsigned, 4> HiQuad(4); 4747 BitVector InputQuads(4); 4748 for (unsigned i = 0; i < 8; ++i) { 4749 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 4750 int EltIdx = SVOp->getMaskElt(i); 4751 MaskVals.push_back(EltIdx); 4752 if (EltIdx < 0) { 4753 ++Quad[0]; 4754 ++Quad[1]; 4755 ++Quad[2]; 4756 ++Quad[3]; 4757 continue; 4758 } 4759 ++Quad[EltIdx / 4]; 4760 InputQuads.set(EltIdx / 4); 4761 } 4762 4763 int BestLoQuad = -1; 4764 unsigned MaxQuad = 1; 4765 for (unsigned i = 0; i < 4; ++i) { 4766 if (LoQuad[i] > MaxQuad) { 4767 BestLoQuad = i; 4768 MaxQuad = LoQuad[i]; 4769 } 4770 } 4771 4772 int BestHiQuad = -1; 4773 MaxQuad = 1; 4774 for (unsigned i = 0; i < 4; ++i) { 4775 if (HiQuad[i] > MaxQuad) { 4776 BestHiQuad = i; 4777 MaxQuad = HiQuad[i]; 4778 } 4779 } 4780 4781 // For SSSE3, if all 8 words of the result come from only 1 quadword of each 4782 // of the two input vectors, shuffle them into one input vector so only a 4783 // single pshufb instruction is necessary. If there are more than 2 input 4784 // quads, disable the next transformation since it does not help SSSE3.
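// Example of the scoring above (added note): for the v8i16 mask
// <0,1,2,3,12,13,14,15> the counts are LoQuad = {4,0,0,0} and
// HiQuad = {0,0,0,4}, so BestLoQuad = 0 and BestHiQuad = 3, and only input
// quads 0 and 3 are set in InputQuads.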
4785 bool V1Used = InputQuads[0] || InputQuads[1]; 4786 bool V2Used = InputQuads[2] || InputQuads[3]; 4787 if (Subtarget->hasSSSE3()) { 4788 if (InputQuads.count() == 2 && V1Used && V2Used) { 4789 BestLoQuad = InputQuads.find_first(); 4790 BestHiQuad = InputQuads.find_next(BestLoQuad); 4791 } 4792 if (InputQuads.count() > 2) { 4793 BestLoQuad = -1; 4794 BestHiQuad = -1; 4795 } 4796 } 4797 4798 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 4799 // the shuffle mask. If a quad is scored as -1, that means that it contains 4800 // words from all 4 input quadwords. 4801 SDValue NewV; 4802 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4803 SmallVector<int, 8> MaskV; 4804 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4805 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4806 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4807 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 4808 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 4809 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 4810 4811 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4812 // source words for the shuffle, to aid later transformations. 4813 bool AllWordsInNewV = true; 4814 bool InOrder[2] = { true, true }; 4815 for (unsigned i = 0; i != 8; ++i) { 4816 int idx = MaskVals[i]; 4817 if (idx != (int)i) 4818 InOrder[i/4] = false; 4819 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4820 continue; 4821 AllWordsInNewV = false; 4822 break; 4823 } 4824 4825 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 4826 if (AllWordsInNewV) { 4827 for (int i = 0; i != 8; ++i) { 4828 int idx = MaskVals[i]; 4829 if (idx < 0) 4830 continue; 4831 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 4832 if ((idx != i) && idx < 4) 4833 pshufhw = false; 4834 if ((idx != i) && idx > 3) 4835 pshuflw = false; 4836 } 4837 V1 = NewV; 4838 V2Used = false; 4839 BestLoQuad = 0; 4840 BestHiQuad = 1; 4841 } 4842 4843 // If we've eliminated the use of V2, and the new mask is a pshuflw or 4844 // pshufhw, that's as cheap as it gets. Return the new shuffle. 4845 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 4846 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 4847 unsigned TargetMask = 0; 4848 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4849 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4850 TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): 4851 X86::getShufflePSHUFLWImmediate(NewV.getNode()); 4852 V1 = NewV.getOperand(0); 4853 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 4854 } 4855 } 4856 4857 // If we have SSSE3, and all words of the result are from 1 input vector, 4858 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4859 // is present, fall back to case 4. 4860 if (Subtarget->hasSSSE3()) { 4861 SmallVector<SDValue,16> pshufbMask; 4862 4863 // If we have elements from both input vectors, set the high bit of the 4864 // shuffle mask element to zero out elements that come from V2 in the V1 4865 // mask, and elements that come from V1 in the V2 mask, so that the two 4866 // results can be OR'd together. 
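// (Added note) A pshufb control byte with the high bit set (0x80) writes
// zero to that result byte. So a word taken from V2 contributes 0x80,0x80
// to V1's byte mask and its two source bytes to V2's mask; OR-ing the two
// shuffled vectors then assembles the final result.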
4867 bool TwoInputs = V1Used && V2Used; 4868 for (unsigned i = 0; i != 8; ++i) { 4869 int EltIdx = MaskVals[i] * 2; 4870 if (TwoInputs && (EltIdx >= 16)) { 4871 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4872 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4873 continue; 4874 } 4875 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4876 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4877 } 4878 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); 4879 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4880 DAG.getNode(ISD::BUILD_VECTOR, dl, 4881 MVT::v16i8, &pshufbMask[0], 16)); 4882 if (!TwoInputs) 4883 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 4884 4885 // Calculate the shuffle mask for the second input, shuffle it, and 4886 // OR it with the first shuffled input. 4887 pshufbMask.clear(); 4888 for (unsigned i = 0; i != 8; ++i) { 4889 int EltIdx = MaskVals[i] * 2; 4890 if (EltIdx < 16) { 4891 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4892 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4893 continue; 4894 } 4895 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4896 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4897 } 4898 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); 4899 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4900 DAG.getNode(ISD::BUILD_VECTOR, dl, 4901 MVT::v16i8, &pshufbMask[0], 16)); 4902 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4903 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 4904 } 4905 4906 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4907 // and update MaskVals with new element order. 4908 BitVector InOrder(8); 4909 if (BestLoQuad >= 0) { 4910 SmallVector<int, 8> MaskV; 4911 for (int i = 0; i != 4; ++i) { 4912 int idx = MaskVals[i]; 4913 if (idx < 0) { 4914 MaskV.push_back(-1); 4915 InOrder.set(i); 4916 } else if ((idx / 4) == BestLoQuad) { 4917 MaskV.push_back(idx & 3); 4918 InOrder.set(i); 4919 } else { 4920 MaskV.push_back(-1); 4921 } 4922 } 4923 for (unsigned i = 4; i != 8; ++i) 4924 MaskV.push_back(i); 4925 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4926 &MaskV[0]); 4927 4928 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4929 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 4930 NewV.getOperand(0), 4931 X86::getShufflePSHUFLWImmediate(NewV.getNode()), 4932 DAG); 4933 } 4934 4935 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4936 // and update MaskVals with the new element order. 
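// (Added note) pshuflw reorders only words 0-3 through a 2-bit-per-word
// immediate and copies words 4-7 unchanged; pshufhw is its mirror image,
// which is why the two quads can be fixed up independently here.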
4937 if (BestHiQuad >= 0) { 4938 SmallVector<int, 8> MaskV; 4939 for (unsigned i = 0; i != 4; ++i) 4940 MaskV.push_back(i); 4941 for (unsigned i = 4; i != 8; ++i) { 4942 int idx = MaskVals[i]; 4943 if (idx < 0) { 4944 MaskV.push_back(-1); 4945 InOrder.set(i); 4946 } else if ((idx / 4) == BestHiQuad) { 4947 MaskV.push_back((idx & 3) + 4); 4948 InOrder.set(i); 4949 } else { 4950 MaskV.push_back(-1); 4951 } 4952 } 4953 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4954 &MaskV[0]); 4955 4956 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4957 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 4958 NewV.getOperand(0), 4959 X86::getShufflePSHUFHWImmediate(NewV.getNode()), 4960 DAG); 4961 } 4962 4963 // In case BestHiQuad and BestLoQuad were both -1, which means each quadword 4964 // has a word from each of the four input quadwords, calculate the InOrder 4965 // bitvector now before falling through to the insert/extract cleanup. 4966 if (BestLoQuad == -1 && BestHiQuad == -1) { 4967 NewV = V1; 4968 for (int i = 0; i != 8; ++i) 4969 if (MaskVals[i] < 0 || MaskVals[i] == i) 4970 InOrder.set(i); 4971 } 4972 4973 // The other elements are put in the right place using pextrw and pinsrw. 4974 for (unsigned i = 0; i != 8; ++i) { 4975 if (InOrder[i]) 4976 continue; 4977 int EltIdx = MaskVals[i]; 4978 if (EltIdx < 0) 4979 continue; 4980 SDValue ExtOp = (EltIdx < 8) 4981 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4982 DAG.getIntPtrConstant(EltIdx)) 4983 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4984 DAG.getIntPtrConstant(EltIdx - 8)); 4985 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4986 DAG.getIntPtrConstant(i)); 4987 } 4988 return NewV; 4989} 4990 4991// v16i8 shuffles - Prefer shuffles in the following order: 4992// 1. [ssse3] 1 x pshufb 4993// 2. [ssse3] 2 x pshufb + 1 x por 4994// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4995static 4996SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4997 SelectionDAG &DAG, 4998 const X86TargetLowering &TLI) { 4999 SDValue V1 = SVOp->getOperand(0); 5000 SDValue V2 = SVOp->getOperand(1); 5001 DebugLoc dl = SVOp->getDebugLoc(); 5002 SmallVector<int, 16> MaskVals; 5003 SVOp->getMask(MaskVals); 5004 5005 // If we have SSSE3, case 1 is generated when all result bytes come from 5006 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 5007 // present, fall back to case 3. 5008 // FIXME: kill V2Only once shuffles are canonicalized by getNode. 5009 bool V1Only = true; 5010 bool V2Only = true; 5011 for (unsigned i = 0; i < 16; ++i) { 5012 int EltIdx = MaskVals[i]; 5013 if (EltIdx < 0) 5014 continue; 5015 if (EltIdx < 16) 5016 V2Only = false; 5017 else 5018 V1Only = false; 5019 } 5020 5021 // If SSSE3, use one pshufb instruction per input vector that contributes elements to the result. 5022 if (TLI.getSubtarget()->hasSSSE3()) { 5023 SmallVector<SDValue,16> pshufbMask; 5024 5025 // If all result elements are from one input vector, then only translate 5026 // undef mask values to 0x80 (zero out result) in the pshufb mask. 5027 // 5028 // Otherwise, we have elements from both input vectors, and must zero out 5029 // elements that come from V2 in the first mask, and V1 in the second mask 5030 // so that we can OR them together.
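// Illustrative mask construction (added note): if MaskVals[i] == 5 the byte
// comes from V1, so V1's pshufb control byte is 5 and V2's is 0x80 (zero);
// for MaskVals[i] == 20 (byte 4 of V2) the roles are swapped: 0x80 for V1
// and 4 for V2.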
5031 bool TwoInputs = !(V1Only || V2Only); 5032 for (unsigned i = 0; i != 16; ++i) { 5033 int EltIdx = MaskVals[i]; 5034 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 5035 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5036 continue; 5037 } 5038 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5039 } 5040 // If all the elements are from V2, assign it to V1 and return after 5041 // building the first pshufb. 5042 if (V2Only) 5043 V1 = V2; 5044 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5045 DAG.getNode(ISD::BUILD_VECTOR, dl, 5046 MVT::v16i8, &pshufbMask[0], 16)); 5047 if (!TwoInputs) 5048 return V1; 5049 5050 // Calculate the shuffle mask for the second input, shuffle it, and 5051 // OR it with the first shuffled input. 5052 pshufbMask.clear(); 5053 for (unsigned i = 0; i != 16; ++i) { 5054 int EltIdx = MaskVals[i]; 5055 if (EltIdx < 16) { 5056 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5057 continue; 5058 } 5059 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 5060 } 5061 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5062 DAG.getNode(ISD::BUILD_VECTOR, dl, 5063 MVT::v16i8, &pshufbMask[0], 16)); 5064 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5065 } 5066 5067 // No SSSE3 - Calculate in-place words and then fix all out-of-place words 5068 // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from 5069 // the 16 different words that comprise the two doublequadword input vectors. 5070 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5071 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); 5072 SDValue NewV = V2Only ? V2 : V1; 5073 for (int i = 0; i != 8; ++i) { 5074 int Elt0 = MaskVals[i*2]; 5075 int Elt1 = MaskVals[i*2+1]; 5076 5077 // This word of the result is all undef, skip it. 5078 if (Elt0 < 0 && Elt1 < 0) 5079 continue; 5080 5081 // This word of the result is already in the correct place, skip it. 5082 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 5083 continue; 5084 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 5085 continue; 5086 5087 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 5088 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 5089 SDValue InsElt; 5090 5091 // If Elt0 and Elt1 are defined, are consecutive, and can be loaded 5092 // together with a single extract, extract the word and insert it. 5093 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 5094 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5095 DAG.getIntPtrConstant(Elt1 / 2)); 5096 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5097 DAG.getIntPtrConstant(i)); 5098 continue; 5099 } 5100 5101 // If Elt1 is defined, extract it from the appropriate source. If the 5102 // source byte is not also odd, shift the extracted word left 8 bits; 5103 // otherwise clear the bottom 8 bits if we need to do an OR. 5104 if (Elt1 >= 0) { 5105 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5106 DAG.getIntPtrConstant(Elt1 / 2)); 5107 if ((Elt1 & 1) == 0) 5108 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 5109 DAG.getConstant(8, 5110 TLI.getShiftAmountTy(InsElt.getValueType()))); 5111 else if (Elt0 >= 0) 5112 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 5113 DAG.getConstant(0xFF00, MVT::i16)); 5114 } 5115 // If Elt0 is defined, extract it from the appropriate source. If the 5116 // source byte is not also even, shift the extracted word right 8 bits. If 5117 // Elt1 was also defined, OR the extracted values together before 5118 // inserting them in the result.
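// Worked example (added note): to build result word i from source bytes
// Elt1 = 2 and Elt0 = 5, extract word 1 and shift it left 8 (byte 2 was the
// even/low byte of its word), extract word 2 and shift it right 8 (byte 5
// was the odd/high byte), then OR: (byte2 << 8) | byte5.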
5119 if (Elt0 >= 0) { 5120 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 5121 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 5122 if ((Elt0 & 1) != 0) 5123 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 5124 DAG.getConstant(8, 5125 TLI.getShiftAmountTy(InsElt0.getValueType()))); 5126 else if (Elt1 >= 0) 5127 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 5128 DAG.getConstant(0x00FF, MVT::i16)); 5129 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 5130 : InsElt0; 5131 } 5132 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5133 DAG.getIntPtrConstant(i)); 5134 } 5135 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); 5136} 5137 5138/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 5139/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 5140/// done when every pair / quad of shuffle mask elements point to elements in 5141/// the right sequence. e.g. 5142/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> 5143static 5144SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 5145 SelectionDAG &DAG, DebugLoc dl) { 5146 EVT VT = SVOp->getValueType(0); 5147 SDValue V1 = SVOp->getOperand(0); 5148 SDValue V2 = SVOp->getOperand(1); 5149 unsigned NumElems = VT.getVectorNumElements(); 5150 unsigned NewWidth = (NumElems == 4) ? 2 : 4; 5151 EVT NewVT; 5152 switch (VT.getSimpleVT().SimpleTy) { 5153 default: assert(false && "Unexpected!"); 5154 case MVT::v4f32: NewVT = MVT::v2f64; break; 5155 case MVT::v4i32: NewVT = MVT::v2i64; break; 5156 case MVT::v8i16: NewVT = MVT::v4i32; break; 5157 case MVT::v16i8: NewVT = MVT::v4i32; break; 5158 } 5159 5160 int Scale = NumElems / NewWidth; 5161 SmallVector<int, 8> MaskVec; 5162 for (unsigned i = 0; i < NumElems; i += Scale) { 5163 int StartIdx = -1; 5164 for (int j = 0; j < Scale; ++j) { 5165 int EltIdx = SVOp->getMaskElt(i+j); 5166 if (EltIdx < 0) 5167 continue; 5168 if (StartIdx == -1) 5169 StartIdx = EltIdx - (EltIdx % Scale); 5170 if (EltIdx != StartIdx + j) 5171 return SDValue(); 5172 } 5173 if (StartIdx == -1) 5174 MaskVec.push_back(-1); 5175 else 5176 MaskVec.push_back(StartIdx / Scale); 5177 } 5178 5179 V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); 5180 V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); 5181 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 5182} 5183 5184/// getVZextMovL - Return a zero-extending vector move low node. 5185/// 5186static SDValue getVZextMovL(EVT VT, EVT OpVT, 5187 SDValue SrcOp, SelectionDAG &DAG, 5188 const X86Subtarget *Subtarget, DebugLoc dl) { 5189 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 5190 LoadSDNode *LD = NULL; 5191 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 5192 LD = dyn_cast<LoadSDNode>(SrcOp); 5193 if (!LD) { 5194 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 5195 // instead. 5196 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 5197 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && 5198 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 5199 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && 5200 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 5201 // PR2108 5202 OpVT = (OpVT == MVT::v2f64) ? 
MVT::v2i64 : MVT::v4i32; 5203 return DAG.getNode(ISD::BITCAST, dl, VT, 5204 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 5205 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5206 OpVT, 5207 SrcOp.getOperand(0) 5208 .getOperand(0)))); 5209 } 5210 } 5211 } 5212 5213 return DAG.getNode(ISD::BITCAST, dl, VT, 5214 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 5215 DAG.getNode(ISD::BITCAST, dl, 5216 OpVT, SrcOp))); 5217} 5218 5219/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 5220/// shuffles. 5221static SDValue 5222LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 5223 SDValue V1 = SVOp->getOperand(0); 5224 SDValue V2 = SVOp->getOperand(1); 5225 DebugLoc dl = SVOp->getDebugLoc(); 5226 EVT VT = SVOp->getValueType(0); 5227 5228 SmallVector<std::pair<int, int>, 8> Locs; 5229 Locs.resize(4); 5230 SmallVector<int, 8> Mask1(4U, -1); 5231 SmallVector<int, 8> PermMask; 5232 SVOp->getMask(PermMask); 5233 5234 unsigned NumHi = 0; 5235 unsigned NumLo = 0; 5236 for (unsigned i = 0; i != 4; ++i) { 5237 int Idx = PermMask[i]; 5238 if (Idx < 0) { 5239 Locs[i] = std::make_pair(-1, -1); 5240 } else { 5241 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 5242 if (Idx < 4) { 5243 Locs[i] = std::make_pair(0, NumLo); 5244 Mask1[NumLo] = Idx; 5245 NumLo++; 5246 } else { 5247 Locs[i] = std::make_pair(1, NumHi); 5248 if (2+NumHi < 4) 5249 Mask1[2+NumHi] = Idx; 5250 NumHi++; 5251 } 5252 } 5253 } 5254 5255 if (NumLo <= 2 && NumHi <= 2) { 5256 // No more than two elements come from either vector. This can be 5257 // implemented with two shuffles. The first shuffle gathers the elements. 5258 // The second shuffle, which takes the first shuffle as both of its 5259 // vector operands, puts the elements into the right order. 5260 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5261 5262 SmallVector<int, 8> Mask2(4U, -1); 5263 5264 for (unsigned i = 0; i != 4; ++i) { 5265 if (Locs[i].first == -1) 5266 continue; 5267 else { 5268 unsigned Idx = (i < 2) ? 0 : 4; 5269 Idx += Locs[i].first * 2 + Locs[i].second; 5270 Mask2[i] = Idx; 5271 } 5272 } 5273 5274 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 5275 } else if (NumLo == 3 || NumHi == 3) { 5276 // Otherwise, we must have three elements from one vector, call it X, and 5277 // one element from the other, call it Y. First, use a shufps to build an 5278 // intermediate vector with the one element from Y and the element from X 5279 // that will be in the same half in the final destination (the indexes don't 5280 // matter). Then, use a shufps to build the final vector, taking the half 5281 // containing the element from Y from the intermediate, and the other half 5282 // from X. 5283 if (NumHi == 3) { 5284 // Normalize it so the 3 elements come from V1. 5285 CommuteVectorShuffleMask(PermMask, VT); 5286 std::swap(V1, V2); 5287 } 5288 5289 // Find the element from V2. 5290 unsigned HiIndex; 5291 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 5292 int Val = PermMask[HiIndex]; 5293 if (Val < 0) 5294 continue; 5295 if (Val >= 4) 5296 break; 5297 } 5298 5299 Mask1[0] = PermMask[HiIndex]; 5300 Mask1[1] = -1; 5301 Mask1[2] = PermMask[HiIndex^1]; 5302 Mask1[3] = -1; 5303 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5304 5305 if (HiIndex >= 2) { 5306 Mask1[0] = PermMask[0]; 5307 Mask1[1] = PermMask[1]; 5308 Mask1[2] = HiIndex & 1 ? 6 : 4; 5309 Mask1[3] = HiIndex & 1 ? 4 : 6; 5310 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5311 } else { 5312 Mask1[0] = HiIndex & 1 ? 2 : 0; 5313 Mask1[1] = HiIndex & 1 ?
0 : 2; 5314 Mask1[2] = PermMask[2]; 5315 Mask1[3] = PermMask[3]; 5316 if (Mask1[2] >= 0) 5317 Mask1[2] += 4; 5318 if (Mask1[3] >= 0) 5319 Mask1[3] += 4; 5320 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 5321 } 5322 } 5323 5324 // Break it into (shuffle shuffle_hi, shuffle_lo). 5325 Locs.clear(); 5326 Locs.resize(4); 5327 SmallVector<int,8> LoMask(4U, -1); 5328 SmallVector<int,8> HiMask(4U, -1); 5329 5330 SmallVector<int,8> *MaskPtr = &LoMask; 5331 unsigned MaskIdx = 0; 5332 unsigned LoIdx = 0; 5333 unsigned HiIdx = 2; 5334 for (unsigned i = 0; i != 4; ++i) { 5335 if (i == 2) { 5336 MaskPtr = &HiMask; 5337 MaskIdx = 1; 5338 LoIdx = 0; 5339 HiIdx = 2; 5340 } 5341 int Idx = PermMask[i]; 5342 if (Idx < 0) { 5343 Locs[i] = std::make_pair(-1, -1); 5344 } else if (Idx < 4) { 5345 Locs[i] = std::make_pair(MaskIdx, LoIdx); 5346 (*MaskPtr)[LoIdx] = Idx; 5347 LoIdx++; 5348 } else { 5349 Locs[i] = std::make_pair(MaskIdx, HiIdx); 5350 (*MaskPtr)[HiIdx] = Idx; 5351 HiIdx++; 5352 } 5353 } 5354 5355 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 5356 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 5357 SmallVector<int, 8> MaskOps; 5358 for (unsigned i = 0; i != 4; ++i) { 5359 if (Locs[i].first == -1) { 5360 MaskOps.push_back(-1); 5361 } else { 5362 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 5363 MaskOps.push_back(Idx); 5364 } 5365 } 5366 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 5367} 5368 5369static bool MayFoldVectorLoad(SDValue V) { 5370 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 5371 V = V.getOperand(0); 5372 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5373 V = V.getOperand(0); 5374 if (MayFoldLoad(V)) 5375 return true; 5376 return false; 5377} 5378 5379// FIXME: the version above should always be used. Since there's 5380// a bug where several vector shuffles can't be folded because the 5381// DAG is not updated during lowering and a node claims to have two 5382// uses while it only has one, use this version, and let isel match 5383// another instruction if the load really happens to have more than 5384// one use. Remove this version after this bug gets fixed. 5385// rdar://8434668, PR8156 5386static bool RelaxedMayFoldVectorLoad(SDValue V) { 5387 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 5388 V = V.getOperand(0); 5389 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5390 V = V.getOperand(0); 5391 if (ISD::isNormalLoad(V.getNode())) 5392 return true; 5393 return false; 5394} 5395 5396/// CanXFormVExtractWithShuffleIntoLoad - Check if the current shuffle is used 5397/// by a vector extract, and if both can later be optimized into a single load. 5398/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked 5399/// here because otherwise a target specific shuffle node would be emitted 5400/// for this shuffle, and the optimization would not be done. 5401/// FIXME: This is probably not the best approach, but it fixes the problem 5402/// until the right path is decided.
5403 static 5404bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, 5405 const TargetLowering &TLI) { 5406 EVT VT = V.getValueType(); 5407 ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); 5408 5409 // Be sure that the vector shuffle is present in a pattern like this: 5410 // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) 5411 if (!V.hasOneUse()) 5412 return false; 5413 5414 SDNode *N = *V.getNode()->use_begin(); 5415 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 5416 return false; 5417 5418 SDValue EltNo = N->getOperand(1); 5419 if (!isa<ConstantSDNode>(EltNo)) 5420 return false; 5421 5422 // If the bitcast changed the number of elements, it is unsafe 5423 // to examine the mask. 5424 bool HasShuffleIntoBitcast = false; 5425 if (V.getOpcode() == ISD::BITCAST) { 5426 EVT SrcVT = V.getOperand(0).getValueType(); 5427 if (SrcVT.getVectorNumElements() != VT.getVectorNumElements()) 5428 return false; 5429 V = V.getOperand(0); 5430 HasShuffleIntoBitcast = true; 5431 } 5432 5433 // Select the input vector, guarding against an out-of-range extract index. 5434 unsigned NumElems = VT.getVectorNumElements(); 5435 unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 5436 int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt); 5437 V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); 5438 5439 // Skip one more bitcast if necessary 5440 if (V.getOpcode() == ISD::BITCAST) 5441 V = V.getOperand(0); 5442 5443 if (ISD::isNormalLoad(V.getNode())) { 5444 // Is the original load suitable? 5445 LoadSDNode *LN0 = cast<LoadSDNode>(V); 5446 5447 // FIXME: avoid the multi-use bug that is preventing lots of 5448 // foldings from being detected; this is still wrong of course, but 5449 // it gives the temporarily desired behavior, and if it happens that 5450 // the load really has more uses, during isel it will not fold, and 5451 // will generate poor code. 5452 if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse() 5453 return false; 5454 5455 if (!HasShuffleIntoBitcast) 5456 return true; 5457 5458 // If there's a bitcast before the shuffle, check if the load type and 5459 // alignment is valid. 5460 unsigned Align = LN0->getAlignment(); 5461 unsigned NewAlign = 5462 TLI.getTargetData()->getABITypeAlignment( 5463 VT.getTypeForEVT(*DAG.getContext())); 5464 5465 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) 5466 return false; 5467 } 5468 5469 return true; 5470} 5471 5472static 5473SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { 5474 EVT VT = Op.getValueType(); 5475 5476 // Canonicalize to v2f64.
5477 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); 5478 return DAG.getNode(ISD::BITCAST, dl, VT, 5479 getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, 5480 V1, DAG)); 5481} 5482 5483static 5484SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, 5485 bool HasSSE2) { 5486 SDValue V1 = Op.getOperand(0); 5487 SDValue V2 = Op.getOperand(1); 5488 EVT VT = Op.getValueType(); 5489 5490 assert(VT != MVT::v2i64 && "unsupported shuffle type"); 5491 5492 if (HasSSE2 && VT == MVT::v2f64) 5493 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); 5494 5495 // v4f32 or v4i32 5496 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG); 5497} 5498 5499static 5500SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { 5501 SDValue V1 = Op.getOperand(0); 5502 SDValue V2 = Op.getOperand(1); 5503 EVT VT = Op.getValueType(); 5504 5505 assert((VT == MVT::v4i32 || VT == MVT::v4f32) && 5506 "unsupported shuffle type"); 5507 5508 if (V2.getOpcode() == ISD::UNDEF) 5509 V2 = V1; 5510 5511 // v4i32 or v4f32 5512 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); 5513} 5514 5515static 5516SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { 5517 SDValue V1 = Op.getOperand(0); 5518 SDValue V2 = Op.getOperand(1); 5519 EVT VT = Op.getValueType(); 5520 unsigned NumElems = VT.getVectorNumElements(); 5521 5522 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second 5523 // operand of these instructions is only memory, so check if there's a 5524 // potential load folding here, otherwise use SHUFPS or MOVSD to match the 5525 // same masks. 5526 bool CanFoldLoad = false; 5527 5528 // Trivial case, when V2 comes from a load. 5529 if (MayFoldVectorLoad(V2)) 5530 CanFoldLoad = true; 5531 5532 // When V1 is a load, it can be folded later into a store in isel, example: 5533 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) 5534 // turns into: 5535 // (MOVLPSmr addr:$src1, VR128:$src2) 5536 // So, recognize this potential and also use MOVLPS or MOVLPD 5537 if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) 5538 CanFoldLoad = true; 5539 5540 // Both of them can't be memory operations though. 5541 if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2)) 5542 CanFoldLoad = false; 5543 5544 if (CanFoldLoad) { 5545 if (HasSSE2 && NumElems == 2) 5546 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); 5547 5548 if (NumElems == 4) 5549 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); 5550 } 5551 5552 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5553 // movl and movlp will both match v2i64, but v2i64 is never matched by 5554 // movl earlier because we make it strict to avoid messing with the movlp load 5555 // folding logic (see the code above this getMOVLP call). Match it here then; 5556 // this is horrible, but it will stay like this until we move all shuffle 5557 // matching to x86 specific nodes. Note that for the 1st condition all 5558 // types are matched with movsd. 5559 if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp)) 5560 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 5561 else if (HasSSE2) 5562 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 5563 5564 5565 assert(VT != MVT::v4i32 && "unsupported shuffle type"); 5566 5567 // Invert the operand order and use SHUFPS to match it.
5568 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1, 5569 X86::getShuffleSHUFImmediate(SVOp), DAG); 5570} 5571 5572static inline unsigned getUNPCKLOpcode(EVT VT, const X86Subtarget *Subtarget) { 5573 switch(VT.getSimpleVT().SimpleTy) { 5574 case MVT::v4i32: return X86ISD::PUNPCKLDQ; 5575 case MVT::v2i64: return X86ISD::PUNPCKLQDQ; 5576 case MVT::v4f32: 5577 return Subtarget->hasAVX() ? X86ISD::VUNPCKLPS : X86ISD::UNPCKLPS; 5578 case MVT::v2f64: 5579 return Subtarget->hasAVX() ? X86ISD::VUNPCKLPD : X86ISD::UNPCKLPD; 5580 case MVT::v8f32: return X86ISD::VUNPCKLPSY; 5581 case MVT::v4f64: return X86ISD::VUNPCKLPDY; 5582 case MVT::v16i8: return X86ISD::PUNPCKLBW; 5583 case MVT::v8i16: return X86ISD::PUNPCKLWD; 5584 default: 5585 llvm_unreachable("Unknown type for unpckl"); 5586 } 5587 return 0; 5588} 5589 5590static inline unsigned getUNPCKHOpcode(EVT VT) { 5591 switch(VT.getSimpleVT().SimpleTy) { 5592 case MVT::v4i32: return X86ISD::PUNPCKHDQ; 5593 case MVT::v2i64: return X86ISD::PUNPCKHQDQ; 5594 case MVT::v4f32: return X86ISD::UNPCKHPS; 5595 case MVT::v2f64: return X86ISD::UNPCKHPD; 5596 case MVT::v16i8: return X86ISD::PUNPCKHBW; 5597 case MVT::v8i16: return X86ISD::PUNPCKHWD; 5598 default: 5599 llvm_unreachable("Unknown type for unpckh"); 5600 } 5601 return 0; 5602} 5603 5604static 5605SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, 5606 const TargetLowering &TLI, 5607 const X86Subtarget *Subtarget) { 5608 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5609 EVT VT = Op.getValueType(); 5610 DebugLoc dl = Op.getDebugLoc(); 5611 SDValue V1 = Op.getOperand(0); 5612 SDValue V2 = Op.getOperand(1); 5613 5614 if (isZeroShuffle(SVOp)) 5615 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 5616 5617 // Handle splat operations 5618 if (SVOp->isSplat()) { 5619 // Special case, this is the only place now where it's 5620 // allowed to return a vector_shuffle operation without 5621 // using a target specific node, because *hopefully* it 5622 // will be optimized away by the dag combiner. 5623 if (VT.getVectorNumElements() <= 4 && 5624 CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI)) 5625 return Op; 5626 5627 // Handle splats by matching through known masks 5628 if (VT.getVectorNumElements() <= 4) 5629 return SDValue(); 5630 5631 // Canonicalize all of the remaining to v4f32. 5632 return PromoteSplat(SVOp, DAG); 5633 } 5634 5635 // If the shuffle can be profitably rewritten as a narrower shuffle, then 5636 // do it! 5637 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 5638 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5639 if (NewOp.getNode()) 5640 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); 5641 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 5642 // FIXME: Figure out a cleaner way to do this. 5643 // Try to make use of movq to zero out the top part. 
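// For instance (added illustrative note), a v4i32 shuffle of (zero, X) that
// keeps X's low two elements in the low half and zeros in the high half
// rewrites to a v2i64 MOVL-style mask, which getVZextMovL below turns into
// a movq that clears the upper 64 bits.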
5644 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 5645 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5646 if (NewOp.getNode()) { 5647 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 5648 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 5649 DAG, Subtarget, dl); 5650 } 5651 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 5652 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5653 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 5654 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 5655 DAG, Subtarget, dl); 5656 } 5657 } 5658 return SDValue(); 5659} 5660 5661SDValue 5662X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 5663 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5664 SDValue V1 = Op.getOperand(0); 5665 SDValue V2 = Op.getOperand(1); 5666 EVT VT = Op.getValueType(); 5667 DebugLoc dl = Op.getDebugLoc(); 5668 unsigned NumElems = VT.getVectorNumElements(); 5669 bool isMMX = VT.getSizeInBits() == 64; 5670 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 5671 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 5672 bool V1IsSplat = false; 5673 bool V2IsSplat = false; 5674 bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX(); 5675 bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX(); 5676 bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX(); 5677 MachineFunction &MF = DAG.getMachineFunction(); 5678 bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); 5679 5680 // Shuffle operations on MMX are not supported. 5681 if (isMMX) 5682 return Op; 5683 5684 // Vector shuffle lowering takes 3 steps: 5685 // 5686 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable 5687 // narrowing and commutation of operands should be handled. 5688 // 2) Matching of shuffles with known shuffle masks to x86 target specific 5689 // shuffle nodes. 5690 // 3) Rewriting of unmatched masks into new generic shuffle operations, 5691 // so the shuffle can be broken into other shuffles and the legalizer can 5692 // try the lowering again. 5693 // 5694 // The general idea is that no vector_shuffle operation should be left to 5695 // be matched during isel; all of them must be converted to a target specific 5696 // node here. 5697 5698 // Normalize the input vectors. Here splats, zeroed vectors, profitable 5699 // narrowing and commutation of operands should be handled. The actual code 5700 // doesn't include all of those, work in progress... 5701 SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget); 5702 if (NewOp.getNode()) 5703 return NewOp; 5704 5705 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and 5706 // unpckh_undef). Only use pshufd if speed is more important than size.
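// (Added note) e.g. the <0,0,1,1> mask can be emitted either as
// pshufd $0x50 or as unpcklps with the same register repeated; the unpck
// form is preferred under OptForSize since it needs no immediate byte.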
5707 if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) 5708 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5709 return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), dl, VT, V1, V1, DAG); 5710 if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) 5711 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5712 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5713 5714 if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef && 5715 RelaxedMayFoldVectorLoad(V1)) 5716 return getMOVDDup(Op, dl, V1, DAG); 5717 5718 if (X86::isMOVHLPS_v_undef_Mask(SVOp)) 5719 return getMOVHighToLow(Op, dl, DAG); 5720 5721 // Used to match splats 5722 if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef && 5723 (VT == MVT::v2f64 || VT == MVT::v2i64)) 5724 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5725 5726 if (X86::isPSHUFDMask(SVOp)) { 5727 // The actual implementation will match the mask in the if above and then 5728 // during isel it can match several different instructions, not only pshufd 5729 // as its name says; sad but true, emulate the behavior for now... 5730 if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 5731 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 5732 5733 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 5734 5735 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 5736 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 5737 5738 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 5739 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1, 5740 TargetMask, DAG); 5741 5742 if (VT == MVT::v4f32) 5743 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1, 5744 TargetMask, DAG); 5745 } 5746 5747 // Check if this can be converted into a logical shift. 5748 bool isLeft = false; 5749 unsigned ShAmt = 0; 5750 SDValue ShVal; 5751 bool isShift = getSubtarget()->hasSSE2() && 5752 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 5753 if (isShift && ShVal.hasOneUse()) { 5754 // If the shifted value has multiple uses, it may be cheaper to use 5755 // v_set0 + movlhps or movhlps, etc. 5756 EVT EltVT = VT.getVectorElementType(); 5757 ShAmt *= EltVT.getSizeInBits(); 5758 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 5759 } 5760 5761 if (X86::isMOVLMask(SVOp)) { 5762 if (V1IsUndef) 5763 return V2; 5764 if (ISD::isBuildVectorAllZeros(V1.getNode())) 5765 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 5766 if (!X86::isMOVLPMask(SVOp)) { 5767 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 5768 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 5769 5770 if (VT == MVT::v4i32 || VT == MVT::v4f32) 5771 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 5772 } 5773 } 5774 5775 // FIXME: fold these into legal mask.
5776 if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) 5777 return getMOVLowToHigh(Op, dl, DAG, HasSSE2); 5778 5779 if (X86::isMOVHLPSMask(SVOp)) 5780 return getMOVHighToLow(Op, dl, DAG); 5781 5782 if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) 5783 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 5784 5785 if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) 5786 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 5787 5788 if (X86::isMOVLPMask(SVOp)) 5789 return getMOVLP(Op, dl, DAG, HasSSE2); 5790 5791 if (ShouldXformToMOVHLPS(SVOp) || 5792 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 5793 return CommuteVectorShuffle(SVOp, DAG); 5794 5795 if (isShift) { 5796 // No better options. Use a vshl / vsrl. 5797 EVT EltVT = VT.getVectorElementType(); 5798 ShAmt *= EltVT.getSizeInBits(); 5799 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 5800 } 5801 5802 bool Commuted = false; 5803 // FIXME: This should also accept a bitcast of a splat? Be careful, not 5804 // 1,1,1,1 -> v8i16 though. 5805 V1IsSplat = isSplatVector(V1.getNode()); 5806 V2IsSplat = isSplatVector(V2.getNode()); 5807 5808 // Canonicalize the splat or undef, if present, to be on the RHS. 5809 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 5810 Op = CommuteVectorShuffle(SVOp, DAG); 5811 SVOp = cast<ShuffleVectorSDNode>(Op); 5812 V1 = SVOp->getOperand(0); 5813 V2 = SVOp->getOperand(1); 5814 std::swap(V1IsSplat, V2IsSplat); 5815 std::swap(V1IsUndef, V2IsUndef); 5816 Commuted = true; 5817 } 5818 5819 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 5820 // Shuffling the low element of v1 into undef; just return v1. 5821 if (V2IsUndef) 5822 return V1; 5823 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 5824 // the instruction selector will not match, so get a canonical MOVL with 5825 // swapped operands to undo the commute. 5826 return getMOVL(DAG, dl, VT, V2, V1); 5827 } 5828 5829 if (X86::isUNPCKLMask(SVOp)) 5830 return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), 5831 dl, VT, V1, V2, DAG); 5832 5833 if (X86::isUNPCKHMask(SVOp)) 5834 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); 5835 5836 if (V2IsSplat) { 5837 // Normalize the mask so all entries that point to V2 point to its first 5838 // element, then try to match unpck{h|l} again. If a match is found, return 5839 // a new vector_shuffle with the corrected mask. 5840 SDValue NewMask = NormalizeMask(SVOp, DAG); 5841 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 5842 if (NSVOp != SVOp) { 5843 if (X86::isUNPCKLMask(NSVOp, true)) { 5844 return NewMask; 5845 } else if (X86::isUNPCKHMask(NSVOp, true)) { 5846 return NewMask; 5847 } 5848 } 5849 } 5850 5851 if (Commuted) { 5852 // Commute it back and try unpck* again. 5853 // FIXME: this seems wrong.
5854 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 5855 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 5856 5857 if (X86::isUNPCKLMask(NewSVOp)) 5858 return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), 5859 dl, VT, V2, V1, DAG); 5860 5861 if (X86::isUNPCKHMask(NewSVOp)) 5862 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); 5863 } 5864 5865 // Normalize the node to match x86 shuffle ops if needed 5866 if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 5867 return CommuteVectorShuffle(SVOp, DAG); 5868 5869 // The checks below are all present in isShuffleMaskLegal, but they are 5870 // inlined here right now to enable us to directly emit target specific 5871 // nodes, and remove one by one until they don't return Op anymore. 5872 SmallVector<int, 16> M; 5873 SVOp->getMask(M); 5874 5875 if (isPALIGNRMask(M, VT, HasSSSE3)) 5876 return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, 5877 X86::getShufflePALIGNRImmediate(SVOp), 5878 DAG); 5879 5880 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 5881 SVOp->getSplatIndex() == 0 && V2IsUndef) { 5882 if (VT == MVT::v2f64) { 5883 X86ISD::NodeType Opcode = 5884 getSubtarget()->hasAVX() ? X86ISD::VUNPCKLPD : X86ISD::UNPCKLPD; 5885 return getTargetShuffleNode(Opcode, dl, VT, V1, V1, DAG); 5886 } 5887 if (VT == MVT::v2i64) 5888 return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); 5889 } 5890 5891 if (isPSHUFHWMask(M, VT)) 5892 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 5893 X86::getShufflePSHUFHWImmediate(SVOp), 5894 DAG); 5895 5896 if (isPSHUFLWMask(M, VT)) 5897 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 5898 X86::getShufflePSHUFLWImmediate(SVOp), 5899 DAG); 5900 5901 if (isSHUFPMask(M, VT)) { 5902 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 5903 if (VT == MVT::v4f32 || VT == MVT::v4i32) 5904 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2, 5905 TargetMask, DAG); 5906 if (VT == MVT::v2f64 || VT == MVT::v2i64) 5907 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2, 5908 TargetMask, DAG); 5909 } 5910 5911 if (X86::isUNPCKL_v_undef_Mask(SVOp)) 5912 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5913 return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), 5914 dl, VT, V1, V1, DAG); 5915 if (X86::isUNPCKH_v_undef_Mask(SVOp)) 5916 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5917 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5918 5919 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 5920 if (VT == MVT::v8i16) { 5921 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 5922 if (NewOp.getNode()) 5923 return NewOp; 5924 } 5925 5926 if (VT == MVT::v16i8) { 5927 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 5928 if (NewOp.getNode()) 5929 return NewOp; 5930 } 5931 5932 // Handle all 4 wide cases with a number of shuffles. 
5933 if (NumElems == 4) 5934 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 5935 5936 return SDValue(); 5937} 5938 5939SDValue 5940X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 5941 SelectionDAG &DAG) const { 5942 EVT VT = Op.getValueType(); 5943 DebugLoc dl = Op.getDebugLoc(); 5944 if (VT.getSizeInBits() == 8) { 5945 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 5946 Op.getOperand(0), Op.getOperand(1)); 5947 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5948 DAG.getValueType(VT)); 5949 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5950 } else if (VT.getSizeInBits() == 16) { 5951 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5952 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 5953 if (Idx == 0) 5954 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5955 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5956 DAG.getNode(ISD::BITCAST, dl, 5957 MVT::v4i32, 5958 Op.getOperand(0)), 5959 Op.getOperand(1))); 5960 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 5961 Op.getOperand(0), Op.getOperand(1)); 5962 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5963 DAG.getValueType(VT)); 5964 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5965 } else if (VT == MVT::f32) { 5966 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 5967 // the result back to FR32 register. It's only worth matching if the 5968 // result has a single use which is a store or a bitcast to i32. And in 5969 // the case of a store, it's not worth it if the index is a constant 0, 5970 // because a MOVSSmr can be used instead, which is smaller and faster. 5971 if (!Op.hasOneUse()) 5972 return SDValue(); 5973 SDNode *User = *Op.getNode()->use_begin(); 5974 if ((User->getOpcode() != ISD::STORE || 5975 (isa<ConstantSDNode>(Op.getOperand(1)) && 5976 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 5977 (User->getOpcode() != ISD::BITCAST || 5978 User->getValueType(0) != MVT::i32)) 5979 return SDValue(); 5980 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5981 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 5982 Op.getOperand(0)), 5983 Op.getOperand(1)); 5984 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 5985 } else if (VT == MVT::i32) { 5986 // ExtractPS works with constant index. 5987 if (isa<ConstantSDNode>(Op.getOperand(1))) 5988 return Op; 5989 } 5990 return SDValue(); 5991} 5992 5993 5994SDValue 5995X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 5996 SelectionDAG &DAG) const { 5997 if (!isa<ConstantSDNode>(Op.getOperand(1))) 5998 return SDValue(); 5999 6000 SDValue Vec = Op.getOperand(0); 6001 EVT VecVT = Vec.getValueType(); 6002 6003 // If this is a 256-bit vector result, first extract the 128-bit 6004 // vector and then extract from the 128-bit vector. 6005 if (VecVT.getSizeInBits() > 128) { 6006 DebugLoc dl = Op.getNode()->getDebugLoc(); 6007 unsigned NumElems = VecVT.getVectorNumElements(); 6008 SDValue Idx = Op.getOperand(1); 6009 6010 if (!isa<ConstantSDNode>(Idx)) 6011 return SDValue(); 6012 6013 unsigned ExtractNumElems = NumElems / (VecVT.getSizeInBits() / 128); 6014 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 6015 6016 // Get the 128-bit vector. 6017 bool Upper = IdxVal >= ExtractNumElems; 6018 Vec = Extract128BitVector(Vec, Idx, DAG, dl); 6019 6020 // Extract from it. 
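// Index scaling example (added note): extracting element 5 of a v8f32 gives
// ExtractNumElems == 4, so the upper v4f32 half is taken and the index is
// rebased to 5 - 4 == 1 before the narrower extract below.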
6021 SDValue ScaledIdx = Idx; 6022 if (Upper) 6023 ScaledIdx = DAG.getNode(ISD::SUB, dl, Idx.getValueType(), Idx, 6024 DAG.getConstant(ExtractNumElems, 6025 Idx.getValueType())); 6026 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, 6027 ScaledIdx); 6028 } 6029 6030 assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); 6031 6032 if (Subtarget->hasSSE41()) { 6033 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 6034 if (Res.getNode()) 6035 return Res; 6036 } 6037 6038 EVT VT = Op.getValueType(); 6039 DebugLoc dl = Op.getDebugLoc(); 6040 // TODO: handle v16i8. 6041 if (VT.getSizeInBits() == 16) { 6042 SDValue Vec = Op.getOperand(0); 6043 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6044 if (Idx == 0) 6045 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6046 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6047 DAG.getNode(ISD::BITCAST, dl, 6048 MVT::v4i32, Vec), 6049 Op.getOperand(1))); 6050 // Transform it so it matches pextrw which produces a 32-bit result. 6051 EVT EltVT = MVT::i32; 6052 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 6053 Op.getOperand(0), Op.getOperand(1)); 6054 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 6055 DAG.getValueType(VT)); 6056 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6057 } else if (VT.getSizeInBits() == 32) { 6058 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6059 if (Idx == 0) 6060 return Op; 6061 6062 // SHUFPS the element to the lowest double word, then movss. 6063 int Mask[4] = { Idx, -1, -1, -1 }; 6064 EVT VVT = Op.getOperand(0).getValueType(); 6065 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6066 DAG.getUNDEF(VVT), Mask); 6067 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6068 DAG.getIntPtrConstant(0)); 6069 } else if (VT.getSizeInBits() == 64) { 6070 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 6071 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 6072 // to match extract_elt for f64. 6073 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6074 if (Idx == 0) 6075 return Op; 6076 6077 // UNPCKHPD the element to the lowest double word, then movsd. 6078 // Note if the lower 64 bits of the result of the UNPCKHPD are then stored 6079 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 6080 int Mask[2] = { 1, -1 }; 6081 EVT VVT = Op.getOperand(0).getValueType(); 6082 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6083 DAG.getUNDEF(VVT), Mask); 6084 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6085 DAG.getIntPtrConstant(0)); 6086 } 6087 6088 return SDValue(); 6089} 6090 6091SDValue 6092X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 6093 SelectionDAG &DAG) const { 6094 EVT VT = Op.getValueType(); 6095 EVT EltVT = VT.getVectorElementType(); 6096 DebugLoc dl = Op.getDebugLoc(); 6097 6098 SDValue N0 = Op.getOperand(0); 6099 SDValue N1 = Op.getOperand(1); 6100 SDValue N2 = Op.getOperand(2); 6101 6102 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 6103 isa<ConstantSDNode>(N2)) { 6104 unsigned Opc; 6105 if (VT == MVT::v8i16) 6106 Opc = X86ISD::PINSRW; 6107 else if (VT == MVT::v16i8) 6108 Opc = X86ISD::PINSRB; 6109 else 6110 Opc = X86ISD::PINSRB; 6111 6112 // Transform it so it matches pinsr{b,w} which expects a GR32 as its second 6113 // argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
    // Bits [7:6] of the constant are the source select. This will always be
    // zero here. The DAG Combiner may combine an extract_elt index into
    // these bits. For example (insert (extract, 3), 2) could be matched by
    // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
    // Bits [5:4] of the constant are the destination select. This is the
    // value of the incoming immediate.
    // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
    // combine either a bitwise AND or an insert of float 0.0 to set these
    // bits.
    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
    // Create this as a scalar-to-vector.
    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
  } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
    // PINSR* works with a constant index.
    return Op;
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();

  DebugLoc dl = Op.getDebugLoc();
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  // If this is a 256-bit vector result, first insert into a 128-bit
  // vector and then insert into the 256-bit vector.
  if (VT.getSizeInBits() > 128) {
    if (!isa<ConstantSDNode>(N2))
      return SDValue();

    // Get the 128-bit vector.
    unsigned NumElems = VT.getVectorNumElements();
    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
    bool Upper = IdxVal >= NumElems / 2;

    SDValue SubN0 = Extract128BitVector(N0, N2, DAG, dl);

    // Insert into it.
    SDValue ScaledN2 = N2;
    if (Upper)
      ScaledN2 = DAG.getNode(ISD::SUB, dl, N2.getValueType(), N2,
                             DAG.getConstant(NumElems /
                                             (VT.getSizeInBits() / 128),
                                             N2.getValueType()));
    Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubN0.getValueType(), SubN0,
                     N1, ScaledN2);

    // Insert the updated 128-bit vector back into the original vector.
    return Insert128BitVector(N0, Op, N2, DAG, dl);
  }

  if (Subtarget->hasSSE41())
    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);

  if (EltVT == MVT::i8)
    return SDValue();

  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
    // Transform it so it matches pinsrw, which expects a 16-bit value in a
    // GR32 as its second argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  DebugLoc dl = Op.getDebugLoc();
  EVT OpVT = Op.getValueType();

  // If this is a 256-bit vector result, first insert into a 128-bit
  // vector and then insert into the 256-bit vector.
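  // (A scalar model of this split, illustrative only:
  //
  //   void scalar_to_vector_256(int s, int out[8]) {
  //     int lo[4] = { s, 0, 0, 0 };   // the 128-bit SCALAR_TO_VECTOR
  //     for (int i = 0; i < 4; ++i)   // placed as the lower half; the
  //       out[i] = lo[i];             // upper half is left undefined
  //   }
  //
  // Only the lower lanes are written; the upper four stay undef.)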
  if (OpVT.getSizeInBits() > 128) {
    // Insert into a 128-bit vector.
    EVT VT128 = EVT::getVectorVT(*Context,
                                 OpVT.getVectorElementType(),
                                 OpVT.getVectorNumElements() / 2);

    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));

    // Insert the 128-bit vector.
    return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op,
                              DAG.getConstant(0, MVT::i32),
                              DAG, dl);
  }

  if (Op.getValueType() == MVT::v1i64 &&
      Op.getOperand(0).getValueType() == MVT::i64)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 &&
         "Expected an SSE type!");
  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(),
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}

// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
// a simple subregister reference or explicit instructions to grab the
// upper bits of a vector.
SDValue
X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
  if (Subtarget->hasAVX()) {
    DebugLoc dl = Op.getNode()->getDebugLoc();
    SDValue Vec = Op.getNode()->getOperand(0);
    SDValue Idx = Op.getNode()->getOperand(1);

    if (Op.getNode()->getValueType(0).getSizeInBits() == 128
        && Vec.getNode()->getValueType(0).getSizeInBits() == 256) {
      return Extract128BitVector(Vec, Idx, DAG, dl);
    }
  }
  return SDValue();
}

// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
SDValue
X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
  if (Subtarget->hasAVX()) {
    DebugLoc dl = Op.getNode()->getDebugLoc();
    SDValue Vec = Op.getNode()->getOperand(0);
    SDValue SubVec = Op.getNode()->getOperand(1);
    SDValue Idx = Op.getNode()->getOperand(2);

    if (Op.getNode()->getValueType(0).getSizeInBits() == 256
        && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) {
      return Insert128BitVector(Vec, SubVec, Idx, DAG, dl);
    }
  }
  return SDValue();
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above-mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
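  // (For example, with GOT-style 32-bit PIC the final address is computed
  // as "PIC base + symbol@GOTOFF", so the wrapped node typically selects
  // into something like "leal .LCPI0_0@GOTOFF(%ebx), %eax"; the label and
  // register shown are illustrative.)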
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
                                             CP->getAlignment(),
                                             CP->getOffset(), OpFlag);
  DebugLoc DL = CP->getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc(), getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
                                          OpFlag);
  DebugLoc DL = JT->getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (OpFlag)
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc(), getPointerTy()),
                         Result);

  return Result;
}

SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);

  DebugLoc DL = Op.getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      !Subtarget->is64Bit()) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc(), getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  // Create the TargetBlockAddress node.
  unsigned char OpFlags =
    Subtarget->ClassifyBlockAddressReference();
  CodeModel::Model M = getTargetMachine().getCodeModel();
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  DebugLoc dl = Op.getDebugLoc();
  SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
                                       /*isTarget=*/true, OpFlags);

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
  else
    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
                                      int64_t Offset,
                                      SelectionDAG &DAG) const {
  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
  unsigned char OpFlags =
    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
  CodeModel::Model M = getTargetMachine().getCodeModel();
  SDValue Result;
  if (OpFlags == X86II::MO_NO_FLAG &&
      X86::isOffsetSuitableForCodeModel(Offset, M)) {
    // A direct static reference to a global.
    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
    Offset = 0;
  } else {
    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
  }

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
  else
    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlags))
    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(), false, false, 0);

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
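  // (For instance, a reference like "&global + 0x100000000": under the
  // small code model that offset can't be folded into the relocation, so
  // we materialize the address of 'global' and then emit this explicit
  // add of the constant. The value shown is illustrative.)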
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
                         DAG.getConstant(Offset, getPointerTy()));

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
}

static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags) {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  DebugLoc dl = GA->getDebugLoc();
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(),
                                           OperandFlags);
  if (InFlag) {
    SDValue Ops[] = { Chain, TGA, *InFlag };
    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
  } else {
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
  }

  // TLSADDR will be codegen'ed as a call. Inform MFI that the function has
  // calls.
  MFI->setAdjustsStack(true);

  SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue InFlag;
  DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                   DAG.getNode(X86ISD::GlobalBaseReg,
                                               DebugLoc(), PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
// "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit) {
  DebugLoc dl = GA->getDebugLoc();

  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
                                                         is64Bit ? 257 : 256));

  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                                      DAG.getIntPtrConstant(0),
                                      MachinePointerInfo(Ptr), false, false, 0);

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initial exec.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ?
      X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (is64Bit) {
    assert(model == TLSModel::InitialExec);
    OperandFlags = X86II::MO_GOTTPOFF;
    WrapperKind = X86ISD::WrapperRIP;
  } else {
    assert(model == TLSModel::InitialExec);
    OperandFlags = X86II::MO_INDNTPOFF;
  }

  // Emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax"
  // (initial exec).
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec)
    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         MachinePointerInfo::getGOT(), false, false, 0);

  // The address of the thread local variable is the thread pointer plus the
  // offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GA->getGlobal();

  if (Subtarget->isTargetELF()) {
    // TODO: implement the "local dynamic" model
    // TODO: implement the "initial exec" model for pic executables

    // If GV is an alias then use the aliasee for determining
    // thread-localness.
    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
      GV = GA->resolveAliasedGlobal(false);

    TLSModel::Model model
      = getTLSModel(GV, getTargetMachine().getRelocationModel());

    switch (model) {
      case TLSModel::GeneralDynamic:
      case TLSModel::LocalDynamic: // not implemented
        if (Subtarget->is64Bit())
          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());

      case TLSModel::InitialExec:
      case TLSModel::LocalExec:
        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
                                   Subtarget->is64Bit());
    }
  } else if (Subtarget->isTargetDarwin()) {
    // Darwin only has one model of TLS. Lower to that.
    unsigned char OpFlag = 0;
    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
                           X86ISD::WrapperRIP : X86ISD::Wrapper;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base reg.
    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
                 !Subtarget->is64Bit();
    if (PIC32)
      OpFlag = X86II::MO_TLVP_PIC_BASE;
    else
      OpFlag = X86II::MO_TLVP;
    DebugLoc DL = Op.getDebugLoc();
    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                GA->getValueType(0),
                                                GA->getOffset(), OpFlag);
    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

    // With PIC32, the address is actually $g + Offset.
    if (PIC32)
      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                           DAG.getNode(X86ISD::GlobalBaseReg,
                                       DebugLoc(), getPointerTy()),
                           Offset);

    // Lowering the machine ISD will make sure everything is in the right
    // location.
    SDValue Chain = DAG.getEntryNode();
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Args[] = { Chain, Offset };
    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);

    // TLSCALL will be codegen'ed as a call. Inform MFI that the function has
    // calls.
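    // (On Darwin x86-64 this typically ends up as
    //    movq  _x@TLVP(%rip), %rdi
    //    callq *(%rdi)
    // i.e. the first word of the TLV descriptor is a getter that returns
    // the variable's address in the usual return register; the register
    // choice shown is illustrative.)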
    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
    MFI->setAdjustsStack(true);

    // And our return value (the TLS address) is in the standard call return
    // value location.
    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
  }

  llvm_unreachable("TLS not implemented for this target.");
  return SDValue();
}

/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                     DAG.getConstant(VTBits - 1, MVT::i8))
                       : DAG.getConstant(0, VT);

  SDValue Tmp2, Tmp3;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  } else {
    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
  }

  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                DAG.getConstant(VTBits, MVT::i8));
  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                             AndNode, DAG.getConstant(0, MVT::i8));

  SDValue Hi, Lo;
  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };

  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
  } else {
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
  }

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, 2, dl);
}

SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  EVT SrcVT = Op.getOperand(0).getValueType();

  if (SrcVT.isVector())
    return SDValue();

  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
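  // (Those cases map directly onto cvtsi2ss/cvtsi2sd. Everything else falls
  // through to the spill-and-FILD path below; a rough C model, assuming an
  // x87-only target:
  //
  //   double sint_to_fp_model(int64_t x) {
  //     int64_t slot = x;       // spill to a stack temporary
  //     return (double)slot;    // selected as fild + store/reload
  //   }
  // )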
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return Op;
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      Subtarget->is64Bit()) {
    return Op;
  }

  DebugLoc dl = Op.getDebugLoc();
  unsigned Size = SrcVT.getSizeInBits()/8;
  MachineFunction &MF = DAG.getMachineFunction();
  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                               StackSlot,
                               MachinePointerInfo::getFixedStack(SSFI),
                               false, false, 0);
  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}

SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                     SDValue StackSlot,
                                     SelectionDAG &DAG) const {
  // Build the FILD
  DebugLoc DL = Op.getDebugLoc();
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
  else
    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);

  unsigned ByteSize = SrcVT.getSizeInBits()/8;

  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
  MachineMemOperand *MMO =
    DAG.getMachineFunction()
    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                          MachineMemOperand::MOLoad, ByteSize, ByteSize);

  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
                                           X86ISD::FILD, DL,
                                           Tys, Ops, array_lengthof(Ops),
                                           SrcVT, MMO);

  if (useSSE) {
    Chain = Result.getValue(1);
    SDValue InFlag = Result.getValue(2);

    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When stackifier is fixed, they can be uncoupled.
    MachineFunction &MF = DAG.getMachineFunction();
    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
    Tys = DAG.getVTList(MVT::Other);
    SDValue Ops[] = {
      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
    };
    MachineMemOperand *MMO =
      DAG.getMachineFunction()
      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                            MachineMemOperand::MOStore, SSFISize, SSFISize);

    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
                                    Ops, array_lengthof(Ops),
                                    Op.getValueType(), MMO);
    Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
                         MachinePointerInfo::getFixedStack(SSFI),
                         false, false, 0);
  }

  return Result;
}

// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
                                               SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here it is in C code, more or less:
  /*
    double uint64_to_double( uint32_t hi, uint32_t lo ) {
      static const __m128i exp = { 0x4330000045300000ULL, 0 };
      static const __m128d bias = { 0x1.0p84, 0x1.0p52 };

      // Copy ints to xmm registers.
      __m128i xh = _mm_cvtsi32_si128( hi );
      __m128i xl = _mm_cvtsi32_si128( lo );

      // Combine into low half of a single xmm register.
      __m128i x = _mm_unpacklo_epi32( xh, xl );
      __m128d d;
      double sd;

      // Merge in appropriate exponents to give the integer bits the right
      // magnitude.
      x = _mm_unpacklo_epi32( x, exp );

      // Subtract away the biases to deal with the IEEE-754 double precision
      // implicit 1.
      d = _mm_sub_pd( (__m128d) x, bias );

      // All conversions up to here are exact. The correctly rounded result is
      // calculated using the current rounding mode using the following
      // horizontal add.
      d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
      _mm_store_sd( &sd, d );   // Because we are returning doubles in XMM, this
                                // store doesn't really need to be here (except
                                // maybe to zero the other double)
      return sd;
    }
  */

  DebugLoc dl = Op.getDebugLoc();
  LLVMContext *Context = DAG.getContext();

  // Build some magic constants.
  std::vector<Constant*> CV0;
  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
  Constant *C0 = ConstantVector::get(CV0);
  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);

  std::vector<Constant*> CV1;
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
  Constant *C1 = ConstantVector::get(CV1);
  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);

  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                        Op.getOperand(0),
                                        DAG.getIntPtrConstant(1)));
  SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                        Op.getOperand(0),
                                        DAG.getIntPtrConstant(0)));
  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                              MachinePointerInfo::getConstantPool(),
                              false, false, 16);
  SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2);
  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                              MachinePointerInfo::getConstantPool(),
                              false, false, 16);
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);

  // Add the halves; easiest way is to swap them into another reg first.
  int ShufMask[2] = { 1, -1 };
  SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
                                      DAG.getUNDEF(MVT::v2f64), ShufMask);
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
                     DAG.getIntPtrConstant(0));
}

// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
                                               SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();
  // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
                                   MVT::f64);

  // Load the 32-bit value into an XMM register.
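  // (A scalar model of the bias trick used below, illustrative only:
  //
  //   #include <stdint.h>
  //   #include <string.h>
  //   double uint32_to_double(uint32_t u) {
  //     uint64_t bits = 0x4330000000000000ULL | u;  // 2^52 + u, exact
  //     double d;
  //     memcpy(&d, &bits, sizeof d);
  //     return d - 0x1.0p52;                        // == (double)u exactly
  //   }
  //
  // OR'ing u into the low mantissa bits of 2^52 is exact because u < 2^32,
  // and subtracting the bias then recovers u exactly.)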
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                         Op.getOperand(0),
                                         DAG.getIntPtrConstant(0)));

  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
                     DAG.getIntPtrConstant(0));

  // Or the load with the bias.
  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Load)),
                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Bias)));
  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
                   DAG.getIntPtrConstant(0));

  // Subtract the bias.
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  EVT DestVT = Op.getValueType();

  if (DestVT.bitsLT(MVT::f64)) {
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0));
  } else if (DestVT.bitsGT(MVT::f64)) {
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
  }

  // The types already match; no rounding or extension is needed.
  return Sub;
}

SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();

  // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);

  EVT SrcVT = N0.getValueType();
  EVT DstVT = Op.getValueType();
  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i64(Op, DAG);
  else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i32(Op, DAG);

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
  if (SrcVT == MVT::i32) {
    SDValue WordOff = DAG.getConstant(4, getPointerTy());
    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
                                     getPointerTy(), StackSlot, WordOff);
    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                  StackSlot, MachinePointerInfo(),
                                  false, false, 0);
    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
                                  OffsetSlot, MachinePointerInfo(),
                                  false, false, 0);
    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
    return Fild;
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                               StackSlot, MachinePointerInfo(),
                               false, false, 0);
  // For i64 source, we need to add the appropriate power of 2 if the input
  // was negative. This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
  // we must be careful to do the computation in x87 extended precision, not
  // in SSE. (The generic code can't know it's OK to do this, or how to.)
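  // (Concretely, and purely as illustration: a negative i64 bit pattern s
  // represents the unsigned value s + 2^64, so after the signed FILD we
  // conditionally re-add 2^64; the 0x5F800000 constant built below is
  // exactly the float 2^64. A C model using x87-width long double:
  //
  //   long double u64_bits_to_fp(int64_t s) {
  //     long double d = (long double)s;  // signed conversion, like FILD
  //     if (s < 0)
  //       d += 0x1.0p64L;                // add back 2^64
  //     return d;
  //   }
  // )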
  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
  MachineMemOperand *MMO =
    DAG.getMachineFunction()
    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                          MachineMemOperand::MOLoad, 8, 8);

  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3,
                                         MVT::i64, MMO);

  APInt FF(32, 0x5F800000ULL);

  // Check whether the sign bit is set.
  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
                                 ISD::SETLT);

  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
  SDValue FudgePtr = DAG.getConstantPool(
                           ConstantInt::get(*DAG.getContext(), FF.zext(64)),
                           getPointerTy());

  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
  SDValue Zero = DAG.getIntPtrConstant(0);
  SDValue Four = DAG.getIntPtrConstant(4);
  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
                               Zero, Four);
  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);

  // Load the value out, extending it from f32 to f80.
  // FIXME: Avoid the extend by constructing the right constant pool?
  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
                                 FudgePtr, MachinePointerInfo::getConstantPool(),
                                 MVT::f32, false, false, 4);
  // Extend everything to 80 bits to force it to be done on x87.
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
}

std::pair<SDValue,SDValue> X86TargetLowering::
FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const {
  DebugLoc DL = Op.getDebugLoc();

  EVT DstTy = Op.getValueType();

  if (!IsSigned) {
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_SINT to lower!");

  // These are really Legal.
  if (DstTy == MVT::i32 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());
  if (Subtarget->is64Bit() &&
      DstTy == MVT::i64 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());

  // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
  // stack slot.
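  // (Sketch of that idea in C with GCC-style inline asm; the real lowering
  // uses the FP_TO_INT*_IN_MEM pseudo, which additionally switches the FPU
  // control word to truncating rounding, omitted here:
  //
  //   int64_t fp_to_sint64_model(double d) {
  //     int64_t r;
  //     __asm__("fldl %1\n\tfistpll %0" : "=m"(r) : "m"(d));
  //     return r;
  //   }
  // )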
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MemSize = DstTy.getSizeInBits()/8;
  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());

  unsigned Opc;
  switch (DstTy.getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
  }

  SDValue Chain = DAG.getEntryNode();
  SDValue Value = Op.getOperand(0);
  EVT TheVT = Op.getOperand(0).getValueType();
  if (isScalarFPTypeInSSEReg(TheVT)) {
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
                         MachinePointerInfo::getFixedStack(SSFI),
                         false, false, 0);
    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
    SDValue Ops[] = {
      Chain, StackSlot, DAG.getValueType(TheVT)
    };

    MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                              MachineMemOperand::MOLoad, MemSize, MemSize);
    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3,
                                    DstTy, MMO);
    Chain = Value.getValue(1);
    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  }

  MachineMemOperand *MMO =
    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                            MachineMemOperand::MOStore, MemSize, MemSize);

  // Build the FP_TO_INT*_IN_MEM
  SDValue Ops[] = { Chain, Value, StackSlot };
  SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
                                         Ops, 3, DstTy, MMO);

  return std::make_pair(FIST, StackSlot);
}

SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
                                           SelectionDAG &DAG) const {
  if (Op.getValueType().isVector())
    return SDValue();

  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
  if (FIST.getNode() == 0) return Op;

  // Load the result.
  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
                     FIST, StackSlot, MachinePointerInfo(), false, false, 0);
}

SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
                                           SelectionDAG &DAG) const {
  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  assert(FIST.getNode() && "Unexpected failure");

  // Load the result.
  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
                     FIST, StackSlot, MachinePointerInfo(), false, false, 0);
}

SDValue X86TargetLowering::LowerFABS(SDValue Op,
                                     SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  EVT EltVT = VT;
  if (VT.isVector())
    EltVT = VT.getVectorElementType();
  std::vector<Constant*> CV;
  if (EltVT == MVT::f64) {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
    CV.push_back(C);
    CV.push_back(C);
  } else {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
  }
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                             MachinePointerInfo::getConstantPool(),
                             false, false, 16);
  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
}

SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  EVT EltVT = VT;
  if (VT.isVector())
    EltVT = VT.getVectorElementType();
  std::vector<Constant*> CV;
  if (EltVT == MVT::f64) {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
    CV.push_back(C);
    CV.push_back(C);
  } else {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
  }
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                             MachinePointerInfo::getConstantPool(),
                             false, false, 16);
  if (VT.isVector()) {
    return DAG.getNode(ISD::BITCAST, dl, VT,
                       DAG.getNode(ISD::XOR, dl, MVT::v2i64,
                                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                               Op.getOperand(0)),
                                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                               Mask)));
  } else {
    return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
  }
}

SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  EVT SrcVT = Op1.getValueType();

  // If the second operand is smaller, extend it first.
  if (SrcVT.bitsLT(VT)) {
    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
    SrcVT = VT;
  }
  // And if it is bigger, shrink it first.
  if (SrcVT.bitsGT(VT)) {
    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
    SrcVT = VT;
  }

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.

  // First get the sign bit of the second operand.
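  // (A scalar model of the mask-and-or sequence that follows, illustrative
  // only:
  //
  //   #include <stdint.h>
  //   #include <string.h>
  //   double copysign_model(double mag, double sgn) {
  //     uint64_t m, s;
  //     memcpy(&m, &mag, 8);
  //     memcpy(&s, &sgn, 8);
  //     uint64_t bits = (m & ~(1ULL << 63))   // clear mag's sign bit
  //                   | (s &  (1ULL << 63));  // take sgn's sign bit
  //     memcpy(&mag, &bits, 8);
  //     return mag;
  //   }
  // )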
  std::vector<Constant*> CV;
  if (SrcVT == MVT::f64) {
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
  } else {
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
  }
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
                              false, false, 16);
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);

  // Shift the sign bit right or left if the two operands have different types.
  if (SrcVT.bitsGT(VT)) {
    // Op0 is MVT::f32, Op1 is MVT::f64.
    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
                          DAG.getConstant(32, MVT::i32));
    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
                          DAG.getIntPtrConstant(0));
  }

  // Clear the first operand's sign bit.
  CV.clear();
  if (VT == MVT::f64) {
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
  } else {
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
  }
  C = ConstantVector::get(CV);
  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
                              false, false, 16);
  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);

  // Or the value with the sign bit.
  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
}

/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
                                    SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();

  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO:
    NeedOF = true;
    break;
  }

  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF)
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));

  unsigned Opcode = 0;
  unsigned NumOperands = 0;
  switch (Op.getNode()->getOpcode()) {
  case ISD::ADD:
    // Due to an isel shortcoming, be conservative if this add is likely to be
    // selected as part of a load-modify-store instruction. When the root node
    // in a match is a store, isel doesn't know how to remap non-chain non-flag
    // uses of other nodes in the match, such as the ADD in this case. This
    // leads to the ADD being left around and reselected, with the result being
    // two adds in the output. Alas, even if none of our users are stores, that
    // doesn't prove we're O.K. Ergo, if we have any parents that aren't
    // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
    // climbing the DAG back to the root, and it doesn't seem to be worth the
    // effort.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
        goto default_case;

    if (ConstantSDNode *C =
        dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
      // An add of one will be selected as an INC.
      if (C->getAPIntValue() == 1) {
        Opcode = X86ISD::INC;
        NumOperands = 1;
        break;
      }

      // An add of negative one (subtract of one) will be selected as a DEC.
      if (C->getAPIntValue().isAllOnesValue()) {
        Opcode = X86ISD::DEC;
        NumOperands = 1;
        break;
      }
    }

    // Otherwise use a regular EFLAGS-setting add.
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;
  case ISD::AND: {
    // If the primary result of the 'and' isn't used, don't bother using
    // X86ISD::AND, because a TEST instruction will be better.
    bool NonFlagUse = false;
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI) {
      SDNode *User = *UI;
      unsigned UOpNo = UI.getOperandNo();
      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
        // Look past the truncate.
        UOpNo = User->use_begin().getOperandNo();
        User = *User->use_begin();
      }

      if (User->getOpcode() != ISD::BRCOND &&
          User->getOpcode() != ISD::SETCC &&
          (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
        NonFlagUse = true;
        break;
      }
    }

    if (!NonFlagUse)
      break;
  }
    // FALL THROUGH
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    // Due to the ISEL shortcoming noted above, be conservative if this op is
    // likely to be selected as part of a load-modify-store instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() == ISD::STORE)
        goto default_case;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (Op.getNode()->getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::OR:  Opcode = X86ISD::OR;  break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    }

    NumOperands = 2;
    break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    return SDValue(Op.getNode(), 1);
  default:
  default_case:
    break;
  }

  if (Opcode == 0)
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));

  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  SmallVector<SDValue, 4> Ops;
  for (unsigned i = 0; i != NumOperands; ++i)
    Ops.push_back(Op.getOperand(i));

  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
  DAG.ReplaceAllUsesWith(Op, New);
  return SDValue(New.getNode(), 1);
}

/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
                                   SelectionDAG &DAG) const {
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
    if (C->getAPIntValue() == 0)
      return EmitTest(Op0, X86CC, DAG);

  DebugLoc dl = Op0.getDebugLoc();
  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
}

/// LowerToBT - The result of an 'and' is compared against zero. Turn it into
/// a BT node if possible.
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
                                     DebugLoc dl, SelectionDAG &DAG) const {
  SDValue Op0 = And.getOperand(0);
  SDValue Op1 = And.getOperand(1);
  if (Op0.getOpcode() == ISD::TRUNCATE)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  SDValue LHS, RHS;
  if (Op1.getOpcode() == ISD::SHL)
    std::swap(Op0, Op1);
  if (Op0.getOpcode() == ISD::SHL) {
    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
      if (And00C->getZExtValue() == 1) {
        // If we looked past a truncate, check that it's only truncating away
        // known zeros.
        unsigned BitWidth = Op0.getValueSizeInBits();
        unsigned AndBitWidth = And.getValueSizeInBits();
        if (BitWidth > AndBitWidth) {
          APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones;
          DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones);
          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
            return SDValue();
        }
        LHS = Op1;
        RHS = Op0.getOperand(1);
      }
  } else if (Op1.getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
    SDValue AndLHS = Op0;
    if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
      LHS = AndLHS.getOperand(0);
      RHS = AndLHS.getOperand(1);
    }
  }

  if (LHS.getNode()) {
    // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
    // instruction. Since the shift amount is in-range-or-undefined, we know
    // that doing a bittest on the i32 value is ok. We extend to i32 because
    // the encoding for the i16 version is larger than the i32 version.
    // Also promote i16 to i32 for performance / code size reasons.
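    // (For example, "(x & (1 << n)) != 0" ultimately selects to something
    // like "btl %ecx, %eax" plus a setb/jb on the carry flag, instead of a
    // shift, and, and test sequence; the registers shown are illustrative.)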
    if (LHS.getValueType() == MVT::i8 ||
        LHS.getValueType() == MVT::i16)
      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);

    // If the operand types disagree, extend the shift amount to match. Since
    // BT ignores high bits (like shifts) we can use anyextend.
    if (LHS.getValueType() != RHS.getValueType())
      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);

    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
    unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                       DAG.getConstant(Cond, MVT::i8), BT);
  }

  return SDValue();
}

SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // Optimize to BT if possible.
  // Lower (X & (1 << N)) == 0 to BT(X, N).
  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
      Op1.getOpcode() == ISD::Constant &&
      cast<ConstantSDNode>(Op1)->isNullValue() &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
    if (NewSetCC.getNode())
      return NewSetCC;
  }

  // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
  // of these.
  if (Op1.getOpcode() == ISD::Constant &&
      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
       cast<ConstantSDNode>(Op1)->isNullValue()) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {

    // If the input is a setcc, then reuse the input setcc or use a new one
    // with the inverted condition.
    if (Op0.getOpcode() == X86ISD::SETCC) {
      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
      bool Invert = (CC == ISD::SETNE) ^
        cast<ConstantSDNode>(Op1)->isNullValue();
      if (!Invert) return Op0;

      CCode = X86::GetOppositeBranchCondition(CCode);
      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                         DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
    }
  }

  bool isFP = Op1.getValueType().isFloatingPoint();
  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
  if (X86CC == X86::COND_INVALID)
    return SDValue();

  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                     DAG.getConstant(X86CC, MVT::i8), EFLAGS);
}

SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond;
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  EVT VT = Op.getValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
  DebugLoc dl = Op.getDebugLoc();

  if (isFP) {
    unsigned SSECC = 8;
    EVT VT0 = Op0.getValueType();
    assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
    unsigned Opc = VT0 == MVT::v4f32 ?
                   X86ISD::CMPPS : X86ISD::CMPPD;
    bool Swap = false;

    switch (SetCCOpcode) {
    default: break;
    case ISD::SETOEQ:
    case ISD::SETEQ:  SSECC = 0; break;
    case ISD::SETOGT:
    case ISD::SETGT: Swap = true; // Fallthrough
    case ISD::SETLT:
    case ISD::SETOLT: SSECC = 1; break;
    case ISD::SETOGE:
    case ISD::SETGE: Swap = true; // Fallthrough
    case ISD::SETLE:
    case ISD::SETOLE: SSECC = 2; break;
    case ISD::SETUO:  SSECC = 3; break;
    case ISD::SETUNE:
    case ISD::SETNE:  SSECC = 4; break;
    case ISD::SETULE: Swap = true;
    case ISD::SETUGE: SSECC = 5; break;
    case ISD::SETULT: Swap = true;
    case ISD::SETUGT: SSECC = 6; break;
    case ISD::SETO:   SSECC = 7; break;
    }
    if (Swap)
      std::swap(Op0, Op1);

    // In the two special cases we can't handle, emit two comparisons.
    if (SSECC == 8) {
      if (SetCCOpcode == ISD::SETUEQ) {
        SDValue UNORD, EQ;
        UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
        EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
        return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
      }
      else if (SetCCOpcode == ISD::SETONE) {
        SDValue ORD, NEQ;
        ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
        NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
        return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
      }
      llvm_unreachable("Illegal FP comparison");
    }
    // Handle all other FP comparisons here.
    return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
  }

  // We are handling one of the integer comparisons here. Since SSE only has
  // GT and EQ comparisons for integers, swapping operands and multiple
  // operations may be required for some comparisons.
  unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
  bool Swap = false, Invert = false, FlipSigns = false;

  switch (VT.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
  case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
  case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
  case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETNE:  Invert = true;
  case ISD::SETEQ:  Opc = EQOpc; break;
  case ISD::SETLT:  Swap = true;
  case ISD::SETGT:  Opc = GTOpc; break;
  case ISD::SETGE:  Swap = true;
  case ISD::SETLE:  Opc = GTOpc; Invert = true; break;
  case ISD::SETULT: Swap = true;
  case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
  case ISD::SETUGE: Swap = true;
  case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
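  // (A model of the sign-flip trick in SSE intrinsics, illustrative only:
  //
  //   #include <emmintrin.h>
  //   #include <limits.h>
  //   __m128i cmpgt_epu32(__m128i a, __m128i b) {   // unsigned a > b
  //     const __m128i SignBit = _mm_set1_epi32(INT_MIN);
  //     return _mm_cmpgt_epi32(_mm_xor_si128(a, SignBit),
  //                            _mm_xor_si128(b, SignBit));
  //   }
  //
  // XOR'ing both inputs with the sign bit maps the unsigned order onto the
  // signed order, so the signed PCMPGT then gives the unsigned answer.)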
  if (FlipSigns) {
    EVT EltVT = VT.getVectorElementType();
    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
                                      EltVT);
    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
                                  SignBits.size());
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  return Result;
}

// isX86LogicalCmp - Return true if opcode is an X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
  unsigned Opc = Op.getNode()->getOpcode();
  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
    return true;
  if (Op.getResNo() == 1 &&
      (Opc == X86ISD::ADD ||
       Opc == X86ISD::SUB ||
       Opc == X86ISD::ADC ||
       Opc == X86ISD::SBB ||
       Opc == X86ISD::SMUL ||
       Opc == X86ISD::UMUL ||
       Opc == X86ISD::INC ||
       Opc == X86ISD::DEC ||
       Opc == X86ISD::OR ||
       Opc == X86ISD::XOR ||
       Opc == X86ISD::AND))
    return true;

  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
    return true;

  return false;
}

static bool isZero(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  return C && C->isNullValue();
}

static bool isAllOnes(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  return C && C->isAllOnesValue();
}
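
// Illustrative sketch (not LLVM API; the helper name is hypothetical): the
// carry trick used by the (select (x == 0), -1, y) transformation in
// LowerSELECT below, in plain integer arithmetic.  "cmp x, 1" borrows
// exactly when x == 0, and SETCC_CARRY/SBB smears that borrow into a 0 or
// all-ones mask.
static unsigned SelectAllOnesOrY(unsigned X, unsigned Y) {
  unsigned Borrow = (X < 1);    // CF after "cmp x, 1": 1 iff x == 0.
  unsigned Mask = 0U - Borrow;  // 0 or 0xFFFFFFFF, like SETCC_CARRY.
  return Mask | Y;              // == (X == 0) ? 0xFFFFFFFF : Y.
}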

SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  bool addTest = true;
  SDValue Cond = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue Op2 = Op.getOperand(2);
  DebugLoc DL = Op.getDebugLoc();
  SDValue CC;

  if (Cond.getOpcode() == ISD::SETCC) {
    SDValue NewCond = LowerSETCC(Cond, DAG);
    if (NewCond.getNode())
      Cond = NewCond;
  }

  // (select (x == 0), -1,  y) -> (sign_bit (x - 1)) | y
  // (select (x == 0),  y, -1) -> ~(sign_bit (x - 1)) | y
  // (select (x != 0),  y, -1) -> (sign_bit (x - 1)) | y
  // (select (x != 0), -1,  y) -> ~(sign_bit (x - 1)) | y
  if (Cond.getOpcode() == X86ISD::SETCC &&
      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
      isZero(Cond.getOperand(1).getOperand(1))) {
    SDValue Cmp = Cond.getOperand(1);

    unsigned CondCode =
      cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

    if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
      SDValue Y = isAllOnes(Op2) ? Op1 : Op2;

      SDValue CmpOp0 = Cmp.getOperand(0);
      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                        CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));

      SDValue Res =   // Res = 0 or -1.
        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                    DAG.getConstant(X86::COND_B, MVT::i8), Cmp);

      if (isAllOnes(Op1) != (CondCode == X86::COND_E))
        Res = DAG.getNOT(DL, Res, Res.getValueType());

      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
      if (N2C == 0 || !N2C->isNullValue())
        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
      return Res;
    }
  }

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    if (C && C->getAPIntValue() == 1)
      Cond = Cond.getOperand(0);
  }

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  if (Cond.getOpcode() == X86ISD::SETCC ||
      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    EVT VT = Op.getValueType();

    bool IllegalFPCMov = false;
    if (VT.isFloatingPoint() && !VT.isVector() &&
        !isScalarFPTypeInSSEReg(VT))  // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      addTest = false;
    }
  }

  if (addTest) {
    // Look past the truncate.
    if (Cond.getOpcode() == ISD::TRUNCATE)
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero.  Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
      if (NewSetCC.getNode()) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DAG);
  }

  // a <  b ? -1 :  0 -> RES = ~setcc_carry
  // a <  b ?  0 : -1 -> RES = setcc_carry
  // a >= b ? -1 :  0 -> RES = setcc_carry
  // a >= b ?  0 : -1 -> RES = ~setcc_carry
  if (Cond.getOpcode() == X86ISD::CMP) {
    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
        (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                DAG.getConstant(X86::COND_B, MVT::i8), Cond);
      if (isAllOnes(Op1) != (CondCode == X86::COND_B))
        return DAG.getNOT(DL, Res, Res.getValueType());
      return Res;
    }
  }

  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
  SDValue Ops[] = { Op2, Op1, CC, Cond };
  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
}

// isAndOrOfSetCCs - Return true if node is an ISD::AND or ISD::OR of two
// X86ISD::SETCC nodes, each of which has no other use apart from the
// AND / OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
  Opc = Op.getOpcode();
  if (Opc != ISD::OR && Opc != ISD::AND)
    return false;
  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(0).hasOneUse() &&
          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(1).hasOneUse());
}

// isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC
// and 1, and the SETCC node has a single use.
static bool isXor1OfSetCC(SDValue Op) {
  if (Op.getOpcode() != ISD::XOR)
    return false;
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (N1C && N1C->getAPIntValue() == 1) {
    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
           Op.getOperand(0).hasOneUse();
  }
  return false;
}

SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  bool addTest = true;
  SDValue Chain = Op.getOperand(0);
  SDValue Cond  = Op.getOperand(1);
  SDValue Dest  = Op.getOperand(2);
  DebugLoc dl = Op.getDebugLoc();
  SDValue CC;

  if (Cond.getOpcode() == ISD::SETCC) {
    SDValue NewCond = LowerSETCC(Cond, DAG);
    if (NewCond.getNode())
      Cond = NewCond;
  }
#if 0
  // FIXME: LowerXALUO doesn't handle these!!
  else if (Cond.getOpcode() == X86ISD::ADD ||
           Cond.getOpcode() == X86ISD::SUB ||
           Cond.getOpcode() == X86ISD::SMUL ||
           Cond.getOpcode() == X86ISD::UMUL)
    Cond = LowerXALUO(Cond, DAG);
#endif

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    if (C && C->getAPIntValue() == 1)
      Cond = Cond.getOperand(0);
  }

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  if (Cond.getOpcode() == X86ISD::SETCC ||
      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
      Cond = Cmp;
      addTest = false;
    } else {
      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
      default: break;
      case X86::COND_O:
      case X86::COND_B:
        // These can only come from an arithmetic instruction with overflow,
        // e.g. SADDO, UADDO.
        Cond = Cond.getNode()->getOperand(1);
        addTest = false;
        break;
      }
    }
  } else {
    unsigned CondOpc;
    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
      SDValue Cmp = Cond.getOperand(0).getOperand(1);
      if (CondOpc == ISD::OR) {
        // Also, recognize the pattern generated by an FCMP_UNE.  We can emit
        // two branches instead of an explicit OR instruction with a
        // separate test.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp)) {
          CC = Cond.getOperand(0).getOperand(0);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = Cond.getOperand(1).getOperand(0);
          Cond = Cmp;
          addTest = false;
        }
      } else { // ISD::AND
        // Also, recognize the pattern generated by an FCMP_OEQ.  We can emit
        // two branches instead of an explicit AND instruction with a
        // separate test.  However, we only do this if this block doesn't
        // have a fall-through edge, because this requires an explicit
        // jmp when the condition is false.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp) &&
            Op.getNode()->hasOneUse()) {
          X86::CondCode CCode =
            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
          CCode = X86::GetOppositeBranchCondition(CCode);
          CC = DAG.getConstant(CCode, MVT::i8);
          SDNode *User = *Op.getNode()->use_begin();
          // Look for an unconditional branch following this conditional
          // branch.  We need this because we need to reverse the successors
          // in order to implement FCMP_OEQ.
          if (User->getOpcode() == ISD::BR) {
            SDValue FalseBB = User->getOperand(1);
            SDNode *NewBR =
              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
            assert(NewBR == User);
            (void)NewBR;
            Dest = FalseBB;

            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                                Chain, Dest, CC, Cmp);
            X86::CondCode CCode =
              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
            CCode = X86::GetOppositeBranchCondition(CCode);
            CC = DAG.getConstant(CCode, MVT::i8);
            Cond = Cmp;
            addTest = false;
          }
        }
      }
    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
      // Recognize the (xor (setcc), 1) pattern; the xor inverts the
      // condition.  It should be transformed by the DAG combiner, except
      // when the condition is set by an arithmetic-with-overflow node.
      X86::CondCode CCode =
        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
      CCode = X86::GetOppositeBranchCondition(CCode);
      CC = DAG.getConstant(CCode, MVT::i8);
      Cond = Cond.getOperand(0).getOperand(1);
      addTest = false;
    }
  }

  if (addTest) {
    // Look past the truncate.
    if (Cond.getOpcode() == ISD::TRUNCATE)
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero.  Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
      if (NewSetCC.getNode()) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DAG);
  }
  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                     Chain, Dest, CC, Cond);
}

// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than
// 4k bytes in one go.  Touching the stack at 4K increments is necessary to
// ensure that the guard pages used by the OS virtual memory manager are
// allocated in correct sequence.
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                           SelectionDAG &DAG) const {
  assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) &&
         "This should be used only on Windows targets");
  assert(!Subtarget->isTargetEnvMacho());
  DebugLoc dl = Op.getDebugLoc();

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue Size  = Op.getOperand(1);
  // FIXME: Ensure alignment here

  SDValue Flag;

  EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
  unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);

  Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
  Flag = Chain.getValue(1);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
  Flag = Chain.getValue(1);

  Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);

  SDValue Ops1[2] = { Chain.getValue(0), Chain };
  return DAG.getMergeValues(Ops1, 2, dl);
}
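
// Illustrative sketch (not LLVM API; names are hypothetical): roughly what
// the probing performed by _alloca/WIN_ALLOCA amounts to.  Each successive
// 4K page of the new allocation is touched in order, so the OS can commit
// its guard pages one page at a time.
static void ProbeStackPages(volatile char *OldTop, unsigned long Size) {
  for (unsigned long Off = 4096; Off <= Size; Off += 4096)
    OldTop[-(long)Off] = 0;  // Touch each page below the old stack top.
}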

SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  DebugLoc DL = Op.getDebugLoc();

  if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
                                   getPointerTy());
    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                        MachinePointerInfo(SV), false, false, 0);
  }

  // __va_list_tag:
  //   gp_offset         (0 - 6 * 8)
  //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (point to parameters coming in memory).
  //   reg_save_area
  SmallVector<SDValue, 8> MemOps;
  SDValue FIN = Op.getOperand(1);
  // Store gp_offset
  SDValue Store = DAG.getStore(Op.getOperand(0), DL,
                               DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
                                               MVT::i32),
                               FIN, MachinePointerInfo(SV), false, false, 0);
  MemOps.push_back(Store);

  // Store fp_offset
  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(4));
  Store = DAG.getStore(Op.getOperand(0), DL,
                       DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
                                       MVT::i32),
                       FIN, MachinePointerInfo(SV, 4), false, false, 0);
  MemOps.push_back(Store);

  // Store ptr to overflow_arg_area
  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(4));
  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
                                    getPointerTy());
  Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
                       MachinePointerInfo(SV, 8),
                       false, false, 0);
  MemOps.push_back(Store);

  // Store ptr to reg_save_area.
  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(8));
  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                    getPointerTy());
  Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
                       MachinePointerInfo(SV, 16), false, false, 0);
  MemOps.push_back(Store);
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                     &MemOps[0], MemOps.size());
}

SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->is64Bit() &&
         "LowerVAARG only handles 64-bit va_arg!");
  assert((Subtarget->isTargetLinux() ||
          Subtarget->isTargetDarwin()) &&
         "Unhandled target in LowerVAARG");
  assert(Op.getNode()->getNumOperands() == 4);
  SDValue Chain = Op.getOperand(0);
  SDValue SrcPtr = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  unsigned Align = Op.getConstantOperandVal(3);
  DebugLoc dl = Op.getDebugLoc();

  EVT ArgVT = Op.getNode()->getValueType(0);
  const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy);
  uint8_t ArgMode;

  // Decide which area this value should be read from.
  // TODO: Implement the AMD64 ABI in its entirety.  This simple
  // selection mechanism works only for the basic types.
  if (ArgVT == MVT::f80) {
    llvm_unreachable("va_arg for f80 not yet implemented");
  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
    ArgMode = 2;  // Argument passed in XMM register.  Use fp_offset.
  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
    ArgMode = 1;  // Argument passed in GPR64 register(s).  Use gp_offset.
  } else {
    llvm_unreachable("Unhandled argument type in LowerVAARG");
  }

  if (ArgMode == 2) {
    // Sanity Check: Make sure using fp_offset makes sense.
    assert(!UseSoftFloat &&
           !(DAG.getMachineFunction()
             .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) &&
           Subtarget->hasXMM());
  }

  // Insert VAARG_64 node into the DAG.
  // VAARG_64 returns two values: Variable Argument Address, Chain.
  SmallVector<SDValue, 11> InstOps;
  InstOps.push_back(Chain);
  InstOps.push_back(SrcPtr);
  InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
  InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
  InstOps.push_back(DAG.getConstant(Align, MVT::i32));
  SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
                                          VTs, &InstOps[0], InstOps.size(),
                                          MVT::i64,
                                          MachinePointerInfo(SV),
                                          /*Align=*/0,
                                          /*Volatile=*/false,
                                          /*ReadMem=*/true,
                                          /*WriteMem=*/true);
  Chain = VAARG.getValue(1);

  // Load the next argument and return it.
  return DAG.getLoad(ArgVT, dl,
                     Chain,
                     VAARG,
                     MachinePointerInfo(),
                     false, false, 0);
}

SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
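  //
  // An equivalent C view of the layout being copied (the offsets match the
  // stores in LowerVASTART above, and the 24-byte total is the memcpy size
  // below):
  //
  //   struct __va_list_tag {
  //     unsigned gp_offset;       // offset 0
  //     unsigned fp_offset;       // offset 4
  //     void *overflow_arg_area;  // offset 8
  //     void *reg_save_area;      // offset 16
  //   };
  //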
  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
  SDValue Chain = Op.getOperand(0);
  SDValue DstPtr = Op.getOperand(1);
  SDValue SrcPtr = Op.getOperand(2);
  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  DebugLoc DL = Op.getDebugLoc();

  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
                       DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
                       false,
                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}

SDValue
X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  switch (IntNo) {
  default: return SDValue();    // Don't custom lower most intrinsics.
  // Comparison intrinsics.
  case Intrinsic::x86_sse_comieq_ss:
  case Intrinsic::x86_sse_comilt_ss:
  case Intrinsic::x86_sse_comile_ss:
  case Intrinsic::x86_sse_comigt_ss:
  case Intrinsic::x86_sse_comige_ss:
  case Intrinsic::x86_sse_comineq_ss:
  case Intrinsic::x86_sse_ucomieq_ss:
  case Intrinsic::x86_sse_ucomilt_ss:
  case Intrinsic::x86_sse_ucomile_ss:
  case Intrinsic::x86_sse_ucomigt_ss:
  case Intrinsic::x86_sse_ucomige_ss:
  case Intrinsic::x86_sse_ucomineq_ss:
  case Intrinsic::x86_sse2_comieq_sd:
  case Intrinsic::x86_sse2_comilt_sd:
  case Intrinsic::x86_sse2_comile_sd:
  case Intrinsic::x86_sse2_comigt_sd:
  case Intrinsic::x86_sse2_comige_sd:
  case Intrinsic::x86_sse2_comineq_sd:
  case Intrinsic::x86_sse2_ucomieq_sd:
  case Intrinsic::x86_sse2_ucomilt_sd:
  case Intrinsic::x86_sse2_ucomile_sd:
  case Intrinsic::x86_sse2_ucomigt_sd:
  case Intrinsic::x86_sse2_ucomige_sd:
  case Intrinsic::x86_sse2_ucomineq_sd: {
    unsigned Opc = 0;
    ISD::CondCode CC = ISD::SETCC_INVALID;
    switch (IntNo) {
    default: break;
    case Intrinsic::x86_sse_comieq_ss:
    case Intrinsic::x86_sse2_comieq_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETEQ;
      break;
    case Intrinsic::x86_sse_comilt_ss:
    case Intrinsic::x86_sse2_comilt_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETLT;
      break;
    case Intrinsic::x86_sse_comile_ss:
    case Intrinsic::x86_sse2_comile_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETLE;
      break;
    case Intrinsic::x86_sse_comigt_ss:
    case Intrinsic::x86_sse2_comigt_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETGT;
      break;
    case Intrinsic::x86_sse_comige_ss:
    case Intrinsic::x86_sse2_comige_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETGE;
      break;
    case Intrinsic::x86_sse_comineq_ss:
    case Intrinsic::x86_sse2_comineq_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETNE;
      break;
    case Intrinsic::x86_sse_ucomieq_ss:
    case Intrinsic::x86_sse2_ucomieq_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETEQ;
      break;
    case Intrinsic::x86_sse_ucomilt_ss:
    case Intrinsic::x86_sse2_ucomilt_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETLT;
      break;
    case Intrinsic::x86_sse_ucomile_ss:
    case Intrinsic::x86_sse2_ucomile_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETLE;
      break;
    case Intrinsic::x86_sse_ucomigt_ss:
    case Intrinsic::x86_sse2_ucomigt_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETGT;
      break;
    case Intrinsic::x86_sse_ucomige_ss:
    case Intrinsic::x86_sse2_ucomige_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETGE;
      break;
    case Intrinsic::x86_sse_ucomineq_ss:
    case Intrinsic::x86_sse2_ucomineq_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETNE;
      break;
    }

    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                DAG.getConstant(X86CC, MVT::i8), Cond);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
  // ptest and testp intrinsics.  The intrinsics these come from are designed
  // to return an integer value, not just an instruction, so lower them to the
  // ptest or testp pattern and a setcc for the result.
  case Intrinsic::x86_sse41_ptestz:
  case Intrinsic::x86_sse41_ptestc:
  case Intrinsic::x86_sse41_ptestnzc:
  case Intrinsic::x86_avx_ptestz_256:
  case Intrinsic::x86_avx_ptestc_256:
  case Intrinsic::x86_avx_ptestnzc_256:
  case Intrinsic::x86_avx_vtestz_ps:
  case Intrinsic::x86_avx_vtestc_ps:
  case Intrinsic::x86_avx_vtestnzc_ps:
  case Intrinsic::x86_avx_vtestz_pd:
  case Intrinsic::x86_avx_vtestc_pd:
  case Intrinsic::x86_avx_vtestnzc_pd:
  case Intrinsic::x86_avx_vtestz_ps_256:
  case Intrinsic::x86_avx_vtestc_ps_256:
  case Intrinsic::x86_avx_vtestnzc_ps_256:
  case Intrinsic::x86_avx_vtestz_pd_256:
  case Intrinsic::x86_avx_vtestc_pd_256:
  case Intrinsic::x86_avx_vtestnzc_pd_256: {
    bool IsTestPacked = false;
    unsigned X86CC = 0;
    switch (IntNo) {
    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
    case Intrinsic::x86_avx_vtestz_ps:
    case Intrinsic::x86_avx_vtestz_pd:
    case Intrinsic::x86_avx_vtestz_ps_256:
    case Intrinsic::x86_avx_vtestz_pd_256:
      IsTestPacked = true; // Fallthrough
    case Intrinsic::x86_sse41_ptestz:
    case Intrinsic::x86_avx_ptestz_256:
      // ZF = 1
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_avx_vtestc_ps:
    case Intrinsic::x86_avx_vtestc_pd:
    case Intrinsic::x86_avx_vtestc_ps_256:
    case Intrinsic::x86_avx_vtestc_pd_256:
      IsTestPacked = true; // Fallthrough
    case Intrinsic::x86_sse41_ptestc:
    case Intrinsic::x86_avx_ptestc_256:
      // CF = 1
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_avx_vtestnzc_ps:
    case Intrinsic::x86_avx_vtestnzc_pd:
    case Intrinsic::x86_avx_vtestnzc_ps_256:
    case Intrinsic::x86_avx_vtestnzc_pd_256:
      IsTestPacked = true; // Fallthrough
    case Intrinsic::x86_sse41_ptestnzc:
    case Intrinsic::x86_avx_ptestnzc_256:
      // ZF and CF = 0
      X86CC = X86::COND_A;
      break;
    }

    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
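
  // Illustrative note (not part of the lowering): for ptest(a, b) the flags
  // are ZF = ((a & b) == 0) and CF = ((~a & b) == 0), so ptestz maps to
  // COND_E, ptestc to COND_B, and ptestnzc (ZF == 0 and CF == 0) to COND_A.
  // TESTP/vtest* is the same idea restricted to the packed sign bits.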

  // Fix vector shift instructions where the last operand is a non-immediate
  // i32 value.
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_mmx_pslli_w:
  case Intrinsic::x86_mmx_pslli_d:
  case Intrinsic::x86_mmx_pslli_q:
  case Intrinsic::x86_mmx_psrli_w:
  case Intrinsic::x86_mmx_psrli_d:
  case Intrinsic::x86_mmx_psrli_q:
  case Intrinsic::x86_mmx_psrai_w:
  case Intrinsic::x86_mmx_psrai_d: {
    SDValue ShAmt = Op.getOperand(2);
    if (isa<ConstantSDNode>(ShAmt))
      return SDValue();

    unsigned NewIntNo = 0;
    EVT ShAmtVT = MVT::v4i32;
    switch (IntNo) {
    case Intrinsic::x86_sse2_pslli_w:
      NewIntNo = Intrinsic::x86_sse2_psll_w;
      break;
    case Intrinsic::x86_sse2_pslli_d:
      NewIntNo = Intrinsic::x86_sse2_psll_d;
      break;
    case Intrinsic::x86_sse2_pslli_q:
      NewIntNo = Intrinsic::x86_sse2_psll_q;
      break;
    case Intrinsic::x86_sse2_psrli_w:
      NewIntNo = Intrinsic::x86_sse2_psrl_w;
      break;
    case Intrinsic::x86_sse2_psrli_d:
      NewIntNo = Intrinsic::x86_sse2_psrl_d;
      break;
    case Intrinsic::x86_sse2_psrli_q:
      NewIntNo = Intrinsic::x86_sse2_psrl_q;
      break;
    case Intrinsic::x86_sse2_psrai_w:
      NewIntNo = Intrinsic::x86_sse2_psra_w;
      break;
    case Intrinsic::x86_sse2_psrai_d:
      NewIntNo = Intrinsic::x86_sse2_psra_d;
      break;
    default: {
      ShAmtVT = MVT::v2i32;
      switch (IntNo) {
      case Intrinsic::x86_mmx_pslli_w:
        NewIntNo = Intrinsic::x86_mmx_psll_w;
        break;
      case Intrinsic::x86_mmx_pslli_d:
        NewIntNo = Intrinsic::x86_mmx_psll_d;
        break;
      case Intrinsic::x86_mmx_pslli_q:
        NewIntNo = Intrinsic::x86_mmx_psll_q;
        break;
      case Intrinsic::x86_mmx_psrli_w:
        NewIntNo = Intrinsic::x86_mmx_psrl_w;
        break;
      case Intrinsic::x86_mmx_psrli_d:
        NewIntNo = Intrinsic::x86_mmx_psrl_d;
        break;
      case Intrinsic::x86_mmx_psrli_q:
        NewIntNo = Intrinsic::x86_mmx_psrl_q;
        break;
      case Intrinsic::x86_mmx_psrai_w:
        NewIntNo = Intrinsic::x86_mmx_psra_w;
        break;
      case Intrinsic::x86_mmx_psrai_d:
        NewIntNo = Intrinsic::x86_mmx_psra_d;
        break;
      default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
      }
      break;
    }
    }

    // The vector shift intrinsics with scalars use 32-bit shift amounts, but
    // the sse2/mmx shift instructions read 64 bits.  Set the upper 32 bits
    // to be zero.
    SDValue ShOps[4];
    ShOps[0] = ShAmt;
    ShOps[1] = DAG.getConstant(0, MVT::i32);
    if (ShAmtVT == MVT::v4i32) {
      ShOps[2] = DAG.getUNDEF(MVT::i32);
      ShOps[3] = DAG.getUNDEF(MVT::i32);
      ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
    } else {
      ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
// FIXME this must be lowered to get rid of the invalid type.
    }

    EVT VT = Op.getValueType();
    ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(NewIntNo, MVT::i32),
                       Op.getOperand(1), ShAmt);
  }
  }
}

SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  MFI->setReturnAddressIsTaken(true);

  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  DebugLoc dl = Op.getDebugLoc();

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset =
      DAG.getConstant(TD->getPointerSize(),
                      Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
                                   FrameAddr, Offset),
                       MachinePointerInfo(), false, false, 0);
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                     RetAddrFI, MachinePointerInfo(), false, false, 0);
}

SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  MFI->setFrameAddressIsTaken(true);

  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo(),
                            false, false, 0);
  return FrameAddr;
}

SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) const {
  return DAG.getIntPtrConstant(2*TD->getPointerSize());
}

SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain   = Op.getOperand(0);
  SDValue Offset  = Op.getOperand(1);
  SDValue Handler = Op.getOperand(2);
  DebugLoc dl = Op.getDebugLoc();

  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
                                     Subtarget->is64Bit() ? X86::RBP : X86::EBP,
                                     getPointerTy());
  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);

  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame,
                                  DAG.getIntPtrConstant(TD->getPointerSize()));
  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
                       false, false, 0);
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
  MF.getRegInfo().addLiveOut(StoreAddrReg);

  return DAG.getNode(X86ISD::EH_RETURN, dl,
                     MVT::Other,
                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
}

SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  DebugLoc dl  = Op.getDebugLoc();

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

  if (Subtarget->is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
    const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr),
                                false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, MVT::i64));
    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
                                MachinePointerInfo(TrmpAddr, 2),
                                false, false, 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td.
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10),
                                false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, MVT::i64));
    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 12),
                                false, false, 2);

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
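    // Illustrative note (not part of the lowering): with the stores above
    // and below, the finished 64-bit trampoline is laid out as
    //   offset  0: 49 BB <FPtr:8 bytes>   movabsq $FPtr, %r11
    //   offset 10: 49 BA <Nest:8 bytes>   movabsq $Nest, %r10
    //   offset 20: 49 FF E3               jmpq   *%r11
    // (the i16 opcode stores are little-endian, so the REX byte lands first).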
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20),
                                false, false, 0);

    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
                                MachinePointerInfo(TrmpAddr, 22),
                                false, false, 0);

    SDValue Ops[] =
      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
    return DAG.getMergeValues(Ops, 2, dl);
  } else {
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td.
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      const FunctionType *FTy = Func->getFunctionType();
      const AttrListPtr &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 1;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;

        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::X86_ThisCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td.
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
    OutChains[0] = DAG.getStore(Root, dl,
                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
                                Trmp, MachinePointerInfo(TrmpAddr),
                                false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, MVT::i32));
    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 1),
                                false, false, 1);

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
                                MachinePointerInfo(TrmpAddr, 5),
                                false, false, 1);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, MVT::i32));
    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
                                MachinePointerInfo(TrmpAddr, 6),
                                false, false, 1);

    SDValue Ops[] =
      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
    return DAG.getMergeValues(Ops, 2, dl);
  }
}

SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
  */

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameLowering &TFI = *TM.getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  EVT VT = Op.getValueType();
  DebugLoc DL = Op.getDebugLoc();

  // Save FP Control Word to stack slot
  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());

  MachineMemOperand *MMO =
    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                            MachineMemOperand::MOStore, 2, 2);

  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
                                          DAG.getVTList(MVT::Other),
                                          Ops, 2, MVT::i16, MMO);

  // Load FP Control Word from stack slot
  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
                            MachinePointerInfo(), false, false, 0);

  // Transform as necessary
  SDValue CWD1 =
    DAG.getNode(ISD::SRL, DL, MVT::i16,
                DAG.getNode(ISD::AND, DL, MVT::i16,
                            CWD, DAG.getConstant(0x800, MVT::i16)),
                DAG.getConstant(11, MVT::i8));
  SDValue CWD2 =
    DAG.getNode(ISD::SRL, DL, MVT::i16,
                DAG.getNode(ISD::AND, DL, MVT::i16,
                            CWD, DAG.getConstant(0x400, MVT::i16)),
                DAG.getConstant(9, MVT::i8));

  SDValue RetVal =
    DAG.getNode(ISD::AND, DL, MVT::i16,
                DAG.getNode(ISD::ADD, DL, MVT::i16,
                            DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
                            DAG.getConstant(1, MVT::i16)),
                DAG.getConstant(3, MVT::i16));

  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}
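
// Illustrative sketch (not LLVM API; the helper name is hypothetical): the
// FPSR-to-FLT_ROUNDS conversion above as plain integer arithmetic.  The two
// shifts swap the rounding-control bits, and the "+ 1, & 3" rotates the
// resulting 0..3 code into FLT_ROUNDS numbering:
//   FPSR 00 (nearest) -> 1,  01 (-inf) -> 3,  10 (+inf) -> 2,  11 (zero) -> 0.
static unsigned MapFPSRToFLTROUNDS(unsigned FPSR) {
  return ((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3;
}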

SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  EVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);

  // If src is zero (i.e. bsr sets ZF), returns NumBits.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits+NumBits-1, OpVT),
    DAG.getConstant(X86::COND_E, MVT::i8),
    Op.getValue(1)
  };
  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));

  // Finally xor with NumBits-1.
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}

SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  EVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits, OpVT),
    DAG.getConstant(X86::COND_E, MVT::i8),
    Op.getValue(1)
  };
  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}

SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
  DebugLoc dl = Op.getDebugLoc();

  //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
  //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
  //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
  //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
  //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
  //
  //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
  //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
  //  return AloBlo + AloBhi + AhiBlo;

  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                       A, DAG.getConstant(32, MVT::i32));
  SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                       B, DAG.getConstant(32, MVT::i32));
  SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       A, B);
  SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       A, Bhi);
  SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       Ahi, B);
  AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                       AloBhi, DAG.getConstant(32, MVT::i32));
  AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                       AhiBlo, DAG.getConstant(32, MVT::i32));
  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
  Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
  return Res;
}
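
// Illustrative sketch (not LLVM API; the helper name is hypothetical): the
// per-lane scalar identity behind LowerMUL_V2I64 above.  With a and b split
// into 32-bit halves, the pmuludq-style 32x32->64 partial products combine
// as below; the AHi*BHi term is shifted by 64 and vanishes modulo 2^64.
static unsigned long long Mul64Via32(unsigned long long A,
                                     unsigned long long B) {
  unsigned long long ALo = A & 0xFFFFFFFFULL, AHi = A >> 32;
  unsigned long long BLo = B & 0xFFFFFFFFULL, BHi = B >> 32;
  return ALo * BLo + ((ALo * BHi + AHi * BLo) << 32);
}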

SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);

  LLVMContext *Context = DAG.getContext();

  // Must have SSE2.
  if (!Subtarget->hasSSE2()) return SDValue();

  // Optimize shl/srl/sra with constant shift amount.
  if (isSplatVector(Amt.getNode())) {
    SDValue SclrAmt = Amt->getOperand(0);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
      uint64_t ShiftAmt = C->getZExtValue();

      if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL)
        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                     R, DAG.getConstant(ShiftAmt, MVT::i32));

      if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL)
        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
                     R, DAG.getConstant(ShiftAmt, MVT::i32));

      if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL)
        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
                     R, DAG.getConstant(ShiftAmt, MVT::i32));

      if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL)
        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                     R, DAG.getConstant(ShiftAmt, MVT::i32));

      if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL)
        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
                     R, DAG.getConstant(ShiftAmt, MVT::i32));

      if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL)
        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
                     R, DAG.getConstant(ShiftAmt, MVT::i32));

      if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA)
        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
                     R, DAG.getConstant(ShiftAmt, MVT::i32));

      if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA)
        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
                     R, DAG.getConstant(ShiftAmt, MVT::i32));
    }
  }

  // Lower SHL with variable shift amount.
  // Cannot lower SHL without SSE4.1 or later.
  if (!Subtarget->hasSSE41()) return SDValue();

  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
    Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
                     Op.getOperand(1), DAG.getConstant(23, MVT::i32));

    ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U));

    std::vector<Constant*> CV(4, CI);
    Constant *C = ConstantVector::get(CV);
    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
    SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                                 MachinePointerInfo::getConstantPool(),
                                 false, false, 16);

    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
  }
  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
    // a = a << 5;
    Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
                     Op.getOperand(1), DAG.getConstant(5, MVT::i32));

    ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15));
    ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63));

    std::vector<Constant*> CVM1(16, CM1);
    std::vector<Constant*> CVM2(16, CM2);
    Constant *C = ConstantVector::get(CVM1);
    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
    SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                            MachinePointerInfo::getConstantPool(),
                            false, false, 16);

    // r = pblendv(r, psllw(r & (char16)15, 4), a);
    M = DAG.getNode(ISD::AND, dl, VT, R, M);
    M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                    DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
                    DAG.getConstant(4, MVT::i32));
    R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op);
    // a += a
    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);

    C = ConstantVector::get(CVM2);
    CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
    M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                    MachinePointerInfo::getConstantPool(),
                    false, false, 16);

    // r = pblendv(r, psllw(r & (char16)63, 2), a);
    M = DAG.getNode(ISD::AND, dl, VT, R, M);
    M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                    DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
                    DAG.getConstant(2, MVT::i32));
    R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op);
    // a += a
    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);

    // return pblendv(r, r+r, a);
    R = DAG.getNode(X86ISD::PBLENDVB, dl, VT,
                    R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op);
    return R;
  }
  return SDValue();
}
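
// Illustrative sketch (not LLVM API; the helper name is hypothetical): the
// bit trick used by the variable v4i32 SHL lowering above.  Shifting the
// per-lane amount k into the exponent field and adding the bits of 1.0f
// yields the IEEE-754 encoding of 2^k; converting that back to integer and
// multiplying then computes x << k.
static unsigned Pow2AsFloatBits(unsigned K) {
  // 0x3f800000 encodes 1.0f; adding K into the exponent field at bit 23
  // scales the encoded value by 2^K (valid while the exponent stays in
  // range, which holds for the 0..31 shift amounts used here).
  return (K << 23) + 0x3f800000U;
}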

SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
  // Lower the "add/sub/mul with overflow" instruction into a regular
  // instruction plus a "setcc" instruction that checks the overflow flag.
  // The "brcond" lowering looks for this combo and may remove the "setcc"
  // instruction if the "setcc" has only one use.
  SDNode *N = Op.getNode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  unsigned BaseOp = 0;
  unsigned Cond = 0;
  DebugLoc DL = Op.getDebugLoc();
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // An add of one will be selected as an INC.  Note that INC doesn't
    // set CF, so we can't do this for UADDO.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
      if (C->isOne()) {
        BaseOp = X86ISD::INC;
        Cond = X86::COND_O;
        break;
      }
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
    // A subtract of one will be selected as a DEC.  Note that DEC doesn't
    // set CF, so we can't do this for USUBO.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
      if (C->isOne()) {
        BaseOp = X86ISD::DEC;
        Cond = X86::COND_O;
        break;
      }
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
    BaseOp = X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
                                 MVT::i32);
    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);

    SDValue SetCC =
      DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                  DAG.getConstant(X86::COND_O, MVT::i32),
                  SDValue(Sum.getNode(), 2));

    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
    return Sum;
  }
  }

  // Also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);

  SDValue SetCC =
    DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
                DAG.getConstant(Cond, MVT::i32),
                SDValue(Sum.getNode(), 1));

  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
  return Sum;
}
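
// Illustrative sketch (not LLVM API; the helper name is hypothetical): the
// flag checked by COND_O above for a signed add.  Overflow occurs exactly
// when both operands have the same sign and the sum's sign differs.
static bool SignedAddOverflows(int LHS, int RHS) {
  unsigned Sum = (unsigned)LHS + (unsigned)RHS; // wraps like the hardware add
  return ((~((unsigned)LHS ^ (unsigned)RHS)) &  // operands agree in sign...
          ((unsigned)LHS ^ Sum)) >> 31;         // ...and the sum flipped it.
}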

SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op,
                                           SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();

  if (!Subtarget->hasSSE2()) {
    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0,
                                   Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
    SDValue Ops[] = {
      DAG.getRegister(X86::ESP, MVT::i32), // Base
      DAG.getTargetConstant(1, MVT::i8),   // Scale
      DAG.getRegister(0, MVT::i32),        // Index
      DAG.getTargetConstant(0, MVT::i32),  // Disp
      DAG.getRegister(0, MVT::i32),        // Segment.
      Zero,
      Chain
    };
    SDNode *Res =
      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
                         array_lengthof(Ops));
    return SDValue(Res, 0);
  }

  unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
  if (!isDev)
    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));

  unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
  unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();

  // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
  if (!Op1 && !Op2 && !Op3 && Op4)
    return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0));

  // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
  if (Op1 && !Op2 && !Op3 && !Op4)
    return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0));

  // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)),
  //           (MFENCE)>;
  return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
}

SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
  EVT T = Op.getValueType();
  DebugLoc DL = Op.getDebugLoc();
  unsigned Reg = 0;
  unsigned size = 0;
  switch(T.getSimpleVT().SimpleTy) {
  default:
    assert(false && "Invalid value type!");
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
  case MVT::i16: Reg = X86::AX;  size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget->is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
                                  Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, MVT::i8),
                    cpIn.getValue(1) };
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                           Ops, 5, T, MMO);
  SDValue cpOut =
    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
  return cpOut;
}

SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
                                                 SelectionDAG &DAG) const {
  assert(Subtarget->is64Bit() && "Result not type legalized?");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue TheChain = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();
  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
                                   rax.getValue(2));
  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
                            DAG.getConstant(32, MVT::i8));
  SDValue Ops[] = {
    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
    rdx.getValue(1)
  };
  return DAG.getMergeValues(Ops, 2, dl);
}
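
// Illustrative sketch (not LLVM API; the helper name is hypothetical): the
// combine performed after RDTSC above.  The instruction leaves the low half
// of the timestamp in EAX/RAX and the high half in EDX/RDX, so the 64-bit
// counter is reassembled by shift-and-or.
static unsigned long long CombineRDTSCHalves(unsigned Lo, unsigned Hi) {
  return ((unsigned long long)Hi << 32) | Lo;
}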

SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
                                        SelectionDAG &DAG) const {
  EVT SrcVT = Op.getOperand(0).getValueType();
  EVT DstVT = Op.getValueType();
  assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
         Subtarget->hasMMX() && "Unexpected custom BITCAST");
  assert((DstVT == MVT::i64 ||
          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
         "Unexpected custom BITCAST");
  // i64 <=> MMX conversions are Legal.
  if (SrcVT==MVT::i64 && DstVT.isVector())
    return Op;
  if (DstVT==MVT::i64 && SrcVT.isVector())
    return Op;
  // MMX <=> MMX conversions are Legal.
  if (SrcVT.isVector() && DstVT.isVector())
    return Op;
  // All other conversions need to be expanded.
  return SDValue();
}

SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  DebugLoc dl = Node->getDebugLoc();
  EVT T = Node->getValueType(0);
  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
                              DAG.getConstant(0, T), Node->getOperand(2));
  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
                       cast<AtomicSDNode>(Node)->getMemoryVT(),
                       Node->getOperand(0),
                       Node->getOperand(1), negOp,
                       cast<AtomicSDNode>(Node)->getSrcValue(),
                       cast<AtomicSDNode>(Node)->getAlignment());
}

static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getNode()->getValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  unsigned Opc;
  bool ExtraOp = false;
  switch (Op.getOpcode()) {
  default: assert(0 && "Invalid code");
  case ISD::ADDC: Opc = X86ISD::ADD; break;
  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
  case ISD::SUBC: Opc = X86ISD::SUB; break;
  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
  }

  if (!ExtraOp)
    return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
                       Op.getOperand(1));
  return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
                     Op.getOperand(1), Op.getOperand(2));
}

/// LowerOperation - Provide custom lowering hooks for some operations.
9158/// 9159SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 9160 switch (Op.getOpcode()) { 9161 default: llvm_unreachable("Should not custom lower this!"); 9162 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 9163 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 9164 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 9165 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 9166 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 9167 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 9168 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 9169 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 9170 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); 9171 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); 9172 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 9173 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 9174 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 9175 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 9176 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 9177 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 9178 case ISD::SHL_PARTS: 9179 case ISD::SRA_PARTS: 9180 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); 9181 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 9182 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 9183 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 9184 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 9185 case ISD::FABS: return LowerFABS(Op, DAG); 9186 case ISD::FNEG: return LowerFNEG(Op, DAG); 9187 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 9188 case ISD::SETCC: return LowerSETCC(Op, DAG); 9189 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 9190 case ISD::SELECT: return LowerSELECT(Op, DAG); 9191 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 9192 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 9193 case ISD::VASTART: return LowerVASTART(Op, DAG); 9194 case ISD::VAARG: return LowerVAARG(Op, DAG); 9195 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 9196 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 9197 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 9198 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 9199 case ISD::FRAME_TO_ARGS_OFFSET: 9200 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 9201 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 9202 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 9203 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 9204 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 9205 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 9206 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 9207 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 9208 case ISD::SRA: 9209 case ISD::SRL: 9210 case ISD::SHL: return LowerShift(Op, DAG); 9211 case ISD::SADDO: 9212 case ISD::UADDO: 9213 case ISD::SSUBO: 9214 case ISD::USUBO: 9215 case ISD::SMULO: 9216 case ISD::UMULO: return LowerXALUO(Op, DAG); 9217 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 9218 case ISD::BITCAST: return LowerBITCAST(Op, DAG); 9219 case ISD::ADDC: 9220 case ISD::ADDE: 9221 case ISD::SUBC: 9222 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 9223 } 9224} 9225 9226void X86TargetLowering:: 9227ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 9228 SelectionDAG &DAG, unsigned NewOp) const { 
  EVT T = Node->getValueType(0);
  DebugLoc dl = Node->getDebugLoc();
  assert(T == MVT::i64 && "Only know how to expand i64 atomics");

  SDValue Chain = Node->getOperand(0);
  SDValue In1 = Node->getOperand(1);
  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(0));
  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(1));
  SDValue Ops[] = { Chain, In1, In2L, In2H };
  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
  SDValue Result =
    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
                            cast<MemSDNode>(Node)->getMemOperand());
  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1) };
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
  Results.push_back(Result.getValue(2));
}

/// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  DebugLoc dl = N->getDebugLoc();
  switch (N->getOpcode()) {
  default:
    assert(false && "Do not know how to custom type legalize this operation!");
    return;
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:
    // We don't want to expand or promote these.
    return;
  case ISD::FP_TO_SINT: {
    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode() != 0) {
      EVT VT = N->getValueType(0);
      // Return a load from the stack slot.
      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
                                    MachinePointerInfo(), false, false, 0));
    }
    return;
  }
  case ISD::READCYCLECOUNTER: {
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue TheChain = N->getOperand(0);
    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
                                     rd.getValue(1));
    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
                                     eax.getValue(2));
    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
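    // (RDTSC returns the counter's low 32 bits in EAX and high 32 bits in
    // EDX.)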
    SDValue Ops[] = { eax, edx };
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
    Results.push_back(edx.getValue(1));
    return;
  }
  case ISD::ATOMIC_CMP_SWAP: {
    EVT T = N->getValueType(0);
    assert(T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(0, MVT::i32));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(1, MVT::i32));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
                             cpInL.getValue(1));
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(0, MVT::i32));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(1, MVT::i32));
    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
                               cpInH.getValue(1));
    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
                               swapInL.getValue(1));
    SDValue Ops[] = { swapInH.getValue(0),
                      N->getOperand(1),
                      swapInH.getValue(1) };
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
    SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys,
                                             Ops, 3, T, MMO);
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
                                        MVT::i32, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
                                        MVT::i32, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0) };
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
    Results.push_back(cpOutH.getValue(1));
    return;
  }
  case ISD::ATOMIC_LOAD_ADD:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
    return;
  case ISD::ATOMIC_LOAD_AND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_NAND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_OR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
    return;
  case ISD::ATOMIC_LOAD_SUB:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
    return;
  case ISD::ATOMIC_LOAD_XOR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
    return;
  case ISD::ATOMIC_SWAP:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
    return;
  }
}

const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return NULL;
  case X86ISD::BSF: return "X86ISD::BSF";
  case X86ISD::BSR: return "X86ISD::BSR";
  case X86ISD::SHLD: return "X86ISD::SHLD";
  case X86ISD::SHRD: return "X86ISD::SHRD";
  case X86ISD::FAND: return "X86ISD::FAND";
  case X86ISD::FOR: return "X86ISD::FOR";
  case X86ISD::FXOR: return "X86ISD::FXOR";
  case X86ISD::FSRL: return "X86ISD::FSRL";
  case X86ISD::FILD: return "X86ISD::FILD";
  case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
  case X86ISD::FLD: return "X86ISD::FLD";
  case X86ISD::FST: return "X86ISD::FST";
  case X86ISD::CALL: return "X86ISD::CALL";
  case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
  case X86ISD::BT: return "X86ISD::BT";
  case X86ISD::CMP: return "X86ISD::CMP";
  case X86ISD::COMI: return "X86ISD::COMI";
  case X86ISD::UCOMI: return "X86ISD::UCOMI";
  case X86ISD::SETCC: return "X86ISD::SETCC";
  case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
  case X86ISD::CMOV: return "X86ISD::CMOV";
  case X86ISD::BRCOND: return "X86ISD::BRCOND";
  case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
  case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
  case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
  case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
  case X86ISD::Wrapper: return "X86ISD::Wrapper";
  case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
  case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
  case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
  case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
  case X86ISD::PINSRB: return "X86ISD::PINSRB";
  case X86ISD::PINSRW: return "X86ISD::PINSRW";
  case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
  case X86ISD::PANDN: return "X86ISD::PANDN";
  case X86ISD::PSIGNB: return "X86ISD::PSIGNB";
  case X86ISD::PSIGNW: return "X86ISD::PSIGNW";
  case X86ISD::PSIGND: return "X86ISD::PSIGND";
  case X86ISD::PBLENDVB: return "X86ISD::PBLENDVB";
  case X86ISD::FMAX: return "X86ISD::FMAX";
  case X86ISD::FMIN: return "X86ISD::FMIN";
  case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
  case X86ISD::FRCP: return "X86ISD::FRCP";
  case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
  case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
  case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
  case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
  case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
  case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
  case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
  case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG";
  case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG";
  case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG";
  case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG";
  case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG";
  case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
  case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
  case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
  case X86ISD::VSHL: return "X86ISD::VSHL";
  case X86ISD::VSRL: return "X86ISD::VSRL";
  case X86ISD::CMPPD: return "X86ISD::CMPPD";
  case X86ISD::CMPPS: return "X86ISD::CMPPS";
  case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB";
  case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW";
  case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD";
  case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ";
  case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB";
  case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW";
  case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD";
  case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ";
  case X86ISD::ADD: return "X86ISD::ADD";
  case X86ISD::SUB: return "X86ISD::SUB";
  case X86ISD::ADC: return "X86ISD::ADC";
  case X86ISD::SBB: return "X86ISD::SBB";
  case X86ISD::SMUL: return "X86ISD::SMUL";
  case X86ISD::UMUL: return "X86ISD::UMUL";
  case X86ISD::INC: return "X86ISD::INC";
  case X86ISD::DEC: return "X86ISD::DEC";
  case X86ISD::OR: return "X86ISD::OR";
  case X86ISD::XOR: return "X86ISD::XOR";
  case X86ISD::AND: return "X86ISD::AND";
  case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
  case X86ISD::PTEST: return "X86ISD::PTEST";
  case X86ISD::TESTP: return "X86ISD::TESTP";
  case X86ISD::PALIGN: return "X86ISD::PALIGN";
  case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
  case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
  case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD";
  case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
  case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD";
  case X86ISD::SHUFPS: return "X86ISD::SHUFPS";
  case X86ISD::SHUFPD: return "X86ISD::SHUFPD";
  case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
  case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
  case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
  case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD";
  case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
  case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
  case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
  case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
  case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
  case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD";
  case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD";
  case X86ISD::MOVSD: return "X86ISD::MOVSD";
  case X86ISD::MOVSS: return "X86ISD::MOVSS";
  case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS";
  case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD";
  case X86ISD::VUNPCKLPS: return "X86ISD::VUNPCKLPS";
  case X86ISD::VUNPCKLPD: return "X86ISD::VUNPCKLPD";
  case X86ISD::VUNPCKLPSY: return "X86ISD::VUNPCKLPSY";
  case X86ISD::VUNPCKLPDY: return "X86ISD::VUNPCKLPDY";
  case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS";
  case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD";
  case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW";
  case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD";
  case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ";
  case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ";
  case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW";
  case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD";
  case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ";
  case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ";
  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
  case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
  case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
  }
}

// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                              const Type *Ty) const {
  // X86 supports extremely general addressing modes.
  CodeModel::Model M = getTargetMachine().getCodeModel();
  Reloc::Model R = getTargetMachine().getRelocationModel();

  // X86 allows a sign-extended 32-bit immediate field as a displacement.
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
    return false;

  if (AM.BaseGV) {
    unsigned GVFlags =
      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());

    // If a reference to this global requires an extra load, we can't fold it.
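    // (For example, a load through a GOT entry or a Darwin non-lazy pointer
    // stub cannot be folded into the addressing mode.)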
    if (isGlobalStubReference(GVFlags))
      return false;

    // If BaseGV requires a register for the PIC base, we cannot also have a
    // BaseReg specified.
    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
      return false;

    // If lower 4G is not available, then we must use rip-relative addressing.
    if ((M != CodeModel::Small || R != Reloc::Static) &&
        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
      return false;
  }

  switch (AM.Scale) {
  case 0:
  case 1:
  case 2:
  case 4:
  case 8:
    // These scales always work.
    break;
  case 3:
  case 5:
  case 9:
    // These scales are formed with basereg+scalereg. Only accept if there is
    // no basereg yet.
    if (AM.HasBaseReg)
      return false;
    break;
  default: // Other stuff never works.
    return false;
  }

  return true;
}


bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  if (NumBits1 <= NumBits2)
    return false;
  return true;
}

bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  if (NumBits1 <= NumBits2)
    return false;
  return true;
}

bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
}

bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
}

bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  // i16 instructions are longer (0x66 prefix) and potentially slower.
  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                      EVT VT) const {
  // Very little shuffling can be done for 64-bit vectors right now.
  if (VT.getSizeInBits() == 64)
    return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());

  // FIXME: pshufb, blends, shifts.
  return (VT.getVectorNumElements() == 2 ||
          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
          isMOVLMask(M, VT) ||
          isSHUFPMask(M, VT) ||
          isPSHUFDMask(M, VT) ||
          isPSHUFHWMask(M, VT) ||
          isPSHUFLWMask(M, VT) ||
          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
          isUNPCKLMask(M, VT) ||
          isUNPCKHMask(M, VT) ||
          isUNPCKL_v_undef_Mask(M, VT) ||
          isUNPCKH_v_undef_Mask(M, VT));
}

bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                          EVT VT) const {
  unsigned NumElts = VT.getVectorNumElements();
  // FIXME: This collection of masks seems suspect.
  if (NumElts == 2)
    return true;
  if (NumElts == 4 && VT.getSizeInBits() == 128) {
    return (isMOVLMask(Mask, VT) ||
            isCommutedMOVLMask(Mask, VT, true) ||
            isSHUFPMask(Mask, VT) ||
            isCommutedSHUFPMask(Mask, VT));
  }
  return false;
}

//===----------------------------------------------------------------------===//
//                           X86 Scheduler Hooks
//===----------------------------------------------------------------------===//

// private utility function
MachineBasicBlock *
X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
                                                       MachineBasicBlock *MBB,
                                                       unsigned regOpc,
                                                       unsigned immOpc,
                                                       unsigned LoadOpc,
                                                       unsigned CXchgOpc,
                                                       unsigned notOpc,
                                                       unsigned EAXreg,
                                                       TargetRegisterClass *RC,
                                                       bool invSrc) const {
  // For the atomic bitwise operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld  t1 = [bitinstr.addr]
  //     op  t2 = t1, [bitinstr.val]
  //     mov EAX = t1
  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
  //     bz  newMBB
  //   fallthrough --> nextMBB
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(bInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

  // Update thisMBB to fall through to newMBB.
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB.
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  // Insert instructions into newMBB based on the incoming instruction.
  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
         "unexpected number of operands");
  DebugLoc dl = bInstr->getDebugLoc();
  MachineOperand &destOper = bInstr->getOperand(0);
  MachineOperand *argOpers[2 + X86::AddrNumOperands];
  int numArgs = bInstr->getNumOperands() - 1;
  for (int i = 0; i < numArgs; ++i)
    argOpers[i] = &bInstr->getOperand(i+1);

  // An x86 address has 5 operands: base, scale, index, displacement, and
  // segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
  int valArgIndx = lastAddrIndx + 1;

  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
  for (int i = 0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  unsigned tt = F->getRegInfo().createVirtualRegister(RC);
  if (invSrc) {
    MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
  } else
    tt = t1;

  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
  MIB.addReg(tt);
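  // Attach the value operand (register or immediate, as asserted above) as
  // the second input of the bitwise op.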
  (*MIB).addOperand(*argOpers[valArgIndx]);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
  MIB.addReg(t1);

  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
  for (int i = 0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MIB.addReg(t2);
  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
                    bInstr->memoperands_end());

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
  MIB.addReg(EAXreg);

  // Insert branch.
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);

  bInstr->eraseFromParent(); // The pseudo instruction is gone now.
  return nextMBB;
}

// private utility function: 64 bit atomics on 32 bit host.
MachineBasicBlock *
X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
                                                       MachineBasicBlock *MBB,
                                                       unsigned regOpcL,
                                                       unsigned regOpcH,
                                                       unsigned immOpcL,
                                                       unsigned immOpcH,
                                                       bool invSrc) const {
  // For the atomic bitwise operator, we generate
  //   thisMBB (instructions are in pairs, except cmpxchg8b)
  //     ld t1,t2 = [bitinstr.addr]
  //   newMBB:
  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
  //     op  t5, t6 <- out1, out2, [bitinstr.val]
  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
  //     mov ECX, EBX <- t5, t6
  //     mov EAX, EDX <- t1, t2
  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
  //     mov t3, t4 <- EAX, EDX
  //     bz  newMBB
  //     result in out1, out2
  //   fallthrough --> nextMBB

  const TargetRegisterClass *RC = X86::GR32RegisterClass;
  const unsigned LoadOpc = X86::MOV32rm;
  const unsigned NotOpc = X86::NOT32r;
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(bInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

  // Update thisMBB to fall through to newMBB.
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB.
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  DebugLoc dl = bInstr->getDebugLoc();
  // Insert instructions into newMBB based on the incoming instruction.
  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
         "unexpected number of operands");
  MachineOperand &dest1Oper = bInstr->getOperand(0);
  MachineOperand &dest2Oper = bInstr->getOperand(1);
  MachineOperand *argOpers[2 + X86::AddrNumOperands];
  for (int i = 0; i < 2 + X86::AddrNumOperands; ++i) {
    argOpers[i] = &bInstr->getOperand(i+2);

    // We use some of the operands multiple times, so conservatively just
    // clear any kill flags that might be present.
    if (argOpers[i]->isReg() && argOpers[i]->isUse())
      argOpers[i]->setIsKill(false);
  }

  // An x86 address has 5 operands: base, scale, index, displacement, and
  // segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]

  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
  for (int i = 0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
  // Add 4 to the displacement to load the high 32 bits.
  for (int i = 0; i <= lastAddrIndx - 2; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MachineOperand newOp3 = *(argOpers[3]);
  if (newOp3.isImm())
    newOp3.setImm(newOp3.getImm() + 4);
  else
    newOp3.setOffset(newOp3.getOffset() + 4);
  (*MIB).addOperand(newOp3);
  (*MIB).addOperand(*argOpers[lastAddrIndx]);

  // t3/4 are defined later, at the bottom of the loop.
  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);

  // The subsequent operations should be using the destination registers of
  // the PHI instructions.
  if (invSrc) {
    t1 = F->getRegInfo().createVirtualRegister(RC);
    t2 = F->getRegInfo().createVirtualRegister(RC);
    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
  } else {
    t1 = dest1Oper.getReg();
    t2 = dest2Oper.getReg();
  }

  int valArgIndx = lastAddrIndx + 1;
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");
  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
  if (regOpcL != X86::MOV32rr)
    MIB.addReg(t1);
  (*MIB).addOperand(*argOpers[valArgIndx]);
  assert(argOpers[valArgIndx + 1]->isReg() ==
         argOpers[valArgIndx]->isReg());
  assert(argOpers[valArgIndx + 1]->isImm() ==
         argOpers[valArgIndx]->isImm());
  if (argOpers[valArgIndx + 1]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
  if (regOpcH != X86::MOV32rr)
    MIB.addReg(t2);
  (*MIB).addOperand(*argOpers[valArgIndx + 1]);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
  MIB.addReg(t1);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
  MIB.addReg(t2);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
  MIB.addReg(t5);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
  MIB.addReg(t6);

  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
  for (int i = 0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
                    bInstr->memoperands_end());

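  // Copy the value cmpxchg8b left in EAX:EDX (on failure, the current memory
  // contents) into t3/t4, which feed the PHIs on the loop back edge.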
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
  MIB.addReg(X86::EAX);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
  MIB.addReg(X86::EDX);

  // Insert branch.
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);

  bInstr->eraseFromParent(); // The pseudo instruction is gone now.
  return nextMBB;
}

// private utility function
MachineBasicBlock *
X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
                                                      MachineBasicBlock *MBB,
                                                      unsigned cmovOpc) const {
  // For the atomic min/max operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld t1 = [min/max.addr]
  //     mov t2 = [min/max.val]
  //     cmp  t1, t2
  //     cmov[cond] t2 = t1
  //     mov EAX = t1
  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
  //     bz   newMBB
  //   fallthrough --> nextMBB
  //
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(mInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

  // Update thisMBB to fall through to newMBB.
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB.
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  DebugLoc dl = mInstr->getDebugLoc();
  // Insert instructions into newMBB based on the incoming instruction.
  assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
         "unexpected number of operands");
  MachineOperand &destOper = mInstr->getOperand(0);
  MachineOperand *argOpers[2 + X86::AddrNumOperands];
  int numArgs = mInstr->getNumOperands() - 1;
  for (int i = 0; i < numArgs; ++i)
    argOpers[i] = &mInstr->getOperand(i+1);

  // An x86 address has 5 operands: base, scale, index, displacement, and
  // segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
  int valArgIndx = lastAddrIndx + 1;

  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
  for (int i = 0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  // We only support register and immediate values.
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");

  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
  else
    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
  (*MIB).addOperand(*argOpers[valArgIndx]);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
  MIB.addReg(t1);

  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
  MIB.addReg(t1);
  MIB.addReg(t2);

  // Generate the cmov.
  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc), t3);
  MIB.addReg(t2);
  MIB.addReg(t1);

  // Compare-and-exchange, in case no one has modified the memory location
  // in the meantime.
  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
  for (int i = 0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MIB.addReg(t3);
  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(mInstr->memoperands_begin(),
                    mInstr->memoperands_end());

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
  MIB.addReg(X86::EAX);

  // Insert branch.
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);

  mInstr->eraseFromParent(); // The pseudo instruction is gone now.
  return nextMBB;
}

// FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX, all of this code can be replaced with that
// in the .td file.
MachineBasicBlock *
X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
                            unsigned numArgs, bool memArg) const {
  assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) &&
         "Target must have SSE4.2 or AVX features enabled");

  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  unsigned Opc;
  if (!Subtarget->hasAVX()) {
    if (memArg)
      Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
    else
      Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
  } else {
    if (memArg)
      Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm;
    else
      Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
  }

  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
  for (unsigned i = 0; i < numArgs; ++i) {
    MachineOperand &Op = MI->getOperand(i+1);
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.addOperand(Op);
  }
  BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
    .addReg(X86::XMM0);

  MI->eraseFromParent();
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  // Address into RAX/EAX, other two args into ECX, EDX.
  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.addOperand(MI->getOperand(i));

  unsigned ValOps = X86::AddrNumOperands;
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
    .addReg(MI->getOperand(ValOps).getReg());
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
    .addReg(MI->getOperand(ValOps+1).getReg());

  // The instruction doesn't actually take any operands though.
  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));

  MI->eraseFromParent(); // The pseudo is gone now.
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const {
  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  // First arg in ECX, the second in EAX.
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
    .addReg(MI->getOperand(0).getReg());
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
    .addReg(MI->getOperand(1).getReg());

  // The instruction doesn't actually take any operands though.
  BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr));

  MI->eraseFromParent(); // The pseudo is gone now.
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(
                   MachineInstr *MI,
                   MachineBasicBlock *MBB) const {
  // Emit va_arg instruction on X86-64.

  // Operands to this pseudo-instruction:
  //   0  ) Output        : destination address (reg)
  //   1-5) Input         : va_list address (addr, i64mem)
  //   6  ) ArgSize       : Size (in bytes) of vararg type
  //   7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
  //   8  ) Align         : Alignment of type
  //   9  ) EFLAGS (implicit-def)

  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");

  unsigned DestReg = MI->getOperand(0).getReg();
  MachineOperand &Base = MI->getOperand(1);
  MachineOperand &Scale = MI->getOperand(2);
  MachineOperand &Index = MI->getOperand(3);
  MachineOperand &Disp = MI->getOperand(4);
  MachineOperand &Segment = MI->getOperand(5);
  unsigned ArgSize = MI->getOperand(6).getImm();
  unsigned ArgMode = MI->getOperand(7).getImm();
  unsigned Align = MI->getOperand(8).getImm();

  // Memory Reference
  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();

  // Machine Information
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
  DebugLoc DL = MI->getDebugLoc();

  // struct va_list {
  //   i32   gp_offset
  //   i32   fp_offset
  //   i64   overflow_area (address)
  //   i64   reg_save_area (address)
  // }
  // sizeof(va_list) = 24
  // alignment(va_list) = 8

  unsigned TotalNumIntRegs = 6;
  unsigned TotalNumXMMRegs = 8;
  bool UseGPOffset = (ArgMode == 1);
  bool UseFPOffset = (ArgMode == 2);
  unsigned MaxOffset = TotalNumIntRegs * 8 +
                       (UseFPOffset ?
                        TotalNumXMMRegs * 16 : 0);

  // Align ArgSize to a multiple of 8.
  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
  bool NeedsAlign = (Align > 8);

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *overflowMBB;
  MachineBasicBlock *offsetMBB;
  MachineBasicBlock *endMBB;

  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
  unsigned OffsetReg = 0;

  if (!UseGPOffset && !UseFPOffset) {
    // If we only pull from the overflow region, we don't create a branch.
    // We don't need to alter control flow.
    OffsetDestReg = 0; // unused
    OverflowDestReg = DestReg;

    offsetMBB = NULL;
    overflowMBB = thisMBB;
    endMBB = thisMBB;
  } else {
    // First emit code to check if gp_offset (or fp_offset) is below the bound.
    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
    // If not, pull from overflow_area. (branch to overflowMBB)
    //
    //       thisMBB
    //         |     .
    //         |        .
    //     offsetMBB   overflowMBB
    //         |        .
    //         |     .
    //        endMBB

    // Registers for the PHI in endMBB.
    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    MachineFunction *MF = MBB->getParent();
    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

    MachineFunction::iterator MBBIter = MBB;
    ++MBBIter;

    // Insert the new basic blocks.
    MF->insert(MBBIter, offsetMBB);
    MF->insert(MBBIter, overflowMBB);
    MF->insert(MBBIter, endMBB);

    // Transfer the remainder of MBB and its successor edges to endMBB.
    endMBB->splice(endMBB->begin(), thisMBB,
                   llvm::next(MachineBasicBlock::iterator(MI)),
                   thisMBB->end());
    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

    // Make offsetMBB and overflowMBB successors of thisMBB.
    thisMBB->addSuccessor(offsetMBB);
    thisMBB->addSuccessor(overflowMBB);

    // endMBB is a successor of both offsetMBB and overflowMBB.
    offsetMBB->addSuccessor(endMBB);
    overflowMBB->addSuccessor(endMBB);

    // Load the offset value into a register.
    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, UseFPOffset ? 4 : 0)
      .addOperand(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

    // Check if there is enough room left to pull this argument.
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
      .addReg(OffsetReg)
      .addImm(MaxOffset + 8 - ArgSizeA8);

    // Branch to "overflowMBB" if offset >= max.
    // Fall through to "offsetMBB" otherwise.
    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
      .addMBB(overflowMBB);
  }

  // In offsetMBB, emit code to use the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg != 0);

    // Read the reg_save_area address.
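    // (reg_save_area lives at offset 16 in the va_list; see the struct
    // layout above.)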
    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, 16)
      .addOperand(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

    // Zero-extend the offset.
    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
      .addImm(0)
      .addReg(OffsetReg)
      .addImm(X86::sub_32bit);

    // Add the offset to the reg_save_area to get the final address.
    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
      .addReg(OffsetReg64)
      .addReg(RegSaveReg);

    // Compute the offset for the next argument.
    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
      .addReg(OffsetReg)
      .addImm(UseFPOffset ? 16 : 8);

    // Store it back into the va_list.
    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, UseFPOffset ? 4 : 0)
      .addOperand(Segment)
      .addReg(NextOffsetReg)
      .setMemRefs(MMOBegin, MMOEnd);

    // Jump to endMBB.
    BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
      .addMBB(endMBB);
  }

  //
  // Emit code to use overflow area
  //

  // Load the overflow_area address into a register.
  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
    .addOperand(Base)
    .addOperand(Scale)
    .addOperand(Index)
    .addDisp(Disp, 8)
    .addOperand(Segment)
    .setMemRefs(MMOBegin, MMOEnd);

  // If we need to align it, do so. Otherwise, just copy the address
  // to OverflowDestReg.
  if (NeedsAlign) {
    // Align the overflow address.
    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);

    // aligned_addr = (addr + (align-1)) & ~(align-1)
    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
      .addReg(OverflowAddrReg)
      .addImm(Align-1);

    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
      .addReg(TmpReg)
      .addImm(~(uint64_t)(Align-1));
  } else {
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
      .addReg(OverflowAddrReg);
  }

  // Compute the next overflow address after this argument.
  // (the overflow address should be kept 8-byte aligned)
  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
    .addReg(OverflowDestReg)
    .addImm(ArgSizeA8);

  // Store the new overflow address.
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
    .addOperand(Base)
    .addOperand(Scale)
    .addOperand(Index)
    .addDisp(Disp, 8)
    .addOperand(Segment)
    .addReg(NextAddrReg)
    .setMemRefs(MMOBegin, MMOEnd);

  // If we branched, emit the PHI to the front of endMBB.
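  // The PHI merges OffsetDestReg and OverflowDestReg into DestReg.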
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), DL,
            TII->get(X86::PHI), DestReg)
      .addReg(OffsetDestReg).addMBB(offsetMBB)
      .addReg(OverflowDestReg).addMBB(overflowMBB);
  }

  // Erase the pseudo instruction.
  MI->eraseFromParent();

  return endMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
                                                 MachineInstr *MI,
                                                 MachineBasicBlock *MBB) const {
  // Emit code to save XMM registers to the stack. The ABI says that the
  // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them,
  // however this code takes a simpler approach and just executes all
  // of the stores if %al is non-zero. It's less code, and it's probably
  // easier on the hardware branch predictor, and stores aren't all that
  // expensive anyway.

  // Create the new basic blocks. One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Transfer the remainder of MBB and its successor edges to EndMBB.
  EndMBB->splice(EndMBB->begin(), MBB,
                 llvm::next(MachineBasicBlock::iterator(MI)),
                 MBB->end());
  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  unsigned CountReg = MI->getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();

  if (!Subtarget->isTargetWin64()) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }

  // In the XMM save block, save all the XMM argument registers.
  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
    MachineMemOperand *MMO =
      F->getMachineMemOperand(
        MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
        MachineMemOperand::MOStore,
        /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
      .addFrameIndex(RegSaveFrameIndex)
      .addImm(/*Scale=*/1)
      .addReg(/*IndexReg=*/0)
      .addImm(/*Disp=*/Offset)
      .addReg(/*Segment=*/0)
      .addReg(MI->getOperand(i).getReg())
      .addMemOperand(MMO);
  }

  MI->eraseFromParent(); // The pseudo instruction is gone now.

  return EndMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  // To "insert" a SELECT_CC instruction, we actually have to insert the
  // diamond control-flow pattern. The incoming instruction knows the
  // destination vreg to set, the condition code register to branch on, the
  // true/false values to select between, and a branch opcode to use.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = BB;
  ++It;

  //  thisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC copy1MBB
  //   fallthrough --> copy0MBB
  MachineBasicBlock *thisMBB = BB;
  MachineFunction *F = BB->getParent();
  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(It, copy0MBB);
  F->insert(It, sinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  const MachineFunction *MF = BB->getParent();
  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
  BitVector ReservedRegs = TRI->getReservedRegs(*MF);

  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
    const MachineOperand &MO = MI->getOperand(I);
    if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue;
    unsigned Reg = MO.getReg();
    if (Reg != X86::EFLAGS) continue;
    copy0MBB->addLiveIn(Reg);
    sinkMBB->addLiveIn(Reg);
  }

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), BB,
                  llvm::next(MachineBasicBlock::iterator(MI)),
                  BB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Add the true and fallthrough blocks as its successors.
  BB->addSuccessor(copy0MBB);
  BB->addSuccessor(sinkMBB);

  // Create the conditional branch instruction.
  unsigned Opc =
    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);

  //  copy0MBB:
  //   %FalseValue = ...
  //   # fallthrough to sinkMBB
  copy0MBB->addSuccessor(sinkMBB);

  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(X86::PHI), MI->getOperand(0).getReg())
    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);

  MI->eraseFromParent(); // The pseudo instruction is gone now.
  return sinkMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
                                        MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  assert(!Subtarget->isTargetEnvMacho());

  // The lowering is pretty easy: we're just emitting the call to _alloca.
  // The non-trivial part is the implicit def of ESP.

  if (Subtarget->isTargetWin64()) {
    if (Subtarget->isTargetCygMing()) {
      // ___chkstk(Mingw64):
      // Clobbers R10, R11, RAX and EFLAGS.
      // Updates RSP.
      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
        .addExternalSymbol("___chkstk")
        .addReg(X86::RAX, RegState::Implicit)
        .addReg(X86::RSP, RegState::Implicit)
        .addReg(X86::RAX, RegState::Define | RegState::Implicit)
        .addReg(X86::RSP, RegState::Define | RegState::Implicit)
        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
    } else {
      // __chkstk(MSVCRT): does not update stack pointer.
      // Clobbers R10, R11 and EFLAGS.
      // FIXME: RAX(allocated size) might be reused and not killed.
      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
        .addExternalSymbol("__chkstk")
        .addReg(X86::RAX, RegState::Implicit)
        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
      // RAX has the offset to be subtracted from RSP.
      BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
        .addReg(X86::RSP)
        .addReg(X86::RAX);
    }
  } else {
    const char *StackProbeSymbol =
      Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";

    BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
      .addExternalSymbol(StackProbeSymbol)
      .addReg(X86::EAX, RegState::Implicit)
      .addReg(X86::ESP, RegState::Implicit)
      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
      .addReg(X86::ESP, RegState::Define | RegState::Implicit)
      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
  }

  MI->eraseFromParent(); // The pseudo instruction is gone now.
10579  return BB;
10580}
10581
10582MachineBasicBlock *
10583X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
10584                                               MachineBasicBlock *BB) const {
10585  switch (MI->getOpcode()) {
10586  default: assert(false && "Unexpected instr type to insert");
10587  case X86::TAILJMPd64:
10588  case X86::TAILJMPr64:
10589  case X86::TAILJMPm64:
10590    assert(!"TAILJMP64 should not be touched here.");
10591  case X86::TCRETURNdi64:
10592  case X86::TCRETURNri64:
10593  case X86::TCRETURNmi64:
10594    // The defs of TCRETURNxx64 include Win64's callee-saved registers as a
10595    // subset; on AMD64, additional defs should be added before register allocation.
10596    if (!Subtarget->isTargetWin64()) {
10597      MI->addRegisterDefined(X86::RSI);
10598      MI->addRegisterDefined(X86::RDI);
10599      MI->addRegisterDefined(X86::XMM6);
10600      MI->addRegisterDefined(X86::XMM7);
10601      MI->addRegisterDefined(X86::XMM8);
10602      MI->addRegisterDefined(X86::XMM9);
10603      MI->addRegisterDefined(X86::XMM10);
10604      MI->addRegisterDefined(X86::XMM11);
10605      MI->addRegisterDefined(X86::XMM12);
10606      MI->addRegisterDefined(X86::XMM13);
10607      MI->addRegisterDefined(X86::XMM14);
10608      MI->addRegisterDefined(X86::XMM15);
10609    }
10610    return BB;
10611  case X86::WIN_ALLOCA:
10612    return EmitLoweredWinAlloca(MI, BB);
10613  case X86::TLSCall_32:
10614  case X86::TLSCall_64:
10615    return EmitLoweredTLSCall(MI, BB);
10616  case X86::CMOV_GR8:
10617  case X86::CMOV_FR32:
10618  case X86::CMOV_FR64:
10619  case X86::CMOV_V4F32:
10620  case X86::CMOV_V2F64:
10621  case X86::CMOV_V2I64:
10622  case X86::CMOV_GR16:
10623  case X86::CMOV_GR32:
10624  case X86::CMOV_RFP32:
10625  case X86::CMOV_RFP64:
10626  case X86::CMOV_RFP80:
10627    return EmitLoweredSelect(MI, BB);
10628
10629  case X86::FP32_TO_INT16_IN_MEM:
10630  case X86::FP32_TO_INT32_IN_MEM:
10631  case X86::FP32_TO_INT64_IN_MEM:
10632  case X86::FP64_TO_INT16_IN_MEM:
10633  case X86::FP64_TO_INT32_IN_MEM:
10634  case X86::FP64_TO_INT64_IN_MEM:
10635  case X86::FP80_TO_INT16_IN_MEM:
10636  case X86::FP80_TO_INT32_IN_MEM:
10637  case X86::FP80_TO_INT64_IN_MEM: {
10638    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
10639    DebugLoc DL = MI->getDebugLoc();
10640
10641    // Change the floating point control register to use "round towards zero"
10642    // mode when truncating to an integer value.
10643    MachineFunction *F = BB->getParent();
10644    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
10645    addFrameReference(BuildMI(*BB, MI, DL,
10646                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
10647
10648    // Load the old value of the control word...
10649    unsigned OldCW =
10650      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
10651    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
10652                      CWFrameIdx);
10653
10654    // Set the control word's rounding mode to round towards zero...
10655    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
10656      .addImm(0xC7F);
10657
10658    // Reload the modified control word now...
10659    addFrameReference(BuildMI(*BB, MI, DL,
10660                              TII->get(X86::FLDCW16m)), CWFrameIdx);
10661
10662    // Restore the memory image of control word to original value
10663    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
10664      .addReg(OldCW);
10665
10666    // Get the X86 opcode to use.
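    // A worked example of the mapping below: FP32_TO_INT16_IN_MEM selects
    // IST_Fp16m32, an x87 integer store of a 16-bit result from an f32
    // source; the opcode suffix encodes <result width>m<source width>.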
10667 unsigned Opc; 10668 switch (MI->getOpcode()) { 10669 default: llvm_unreachable("illegal opcode!"); 10670 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 10671 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 10672 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 10673 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 10674 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 10675 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 10676 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 10677 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 10678 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 10679 } 10680 10681 X86AddressMode AM; 10682 MachineOperand &Op = MI->getOperand(0); 10683 if (Op.isReg()) { 10684 AM.BaseType = X86AddressMode::RegBase; 10685 AM.Base.Reg = Op.getReg(); 10686 } else { 10687 AM.BaseType = X86AddressMode::FrameIndexBase; 10688 AM.Base.FrameIndex = Op.getIndex(); 10689 } 10690 Op = MI->getOperand(1); 10691 if (Op.isImm()) 10692 AM.Scale = Op.getImm(); 10693 Op = MI->getOperand(2); 10694 if (Op.isImm()) 10695 AM.IndexReg = Op.getImm(); 10696 Op = MI->getOperand(3); 10697 if (Op.isGlobal()) { 10698 AM.GV = Op.getGlobal(); 10699 } else { 10700 AM.Disp = Op.getImm(); 10701 } 10702 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 10703 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 10704 10705 // Reload the original control word now. 10706 addFrameReference(BuildMI(*BB, MI, DL, 10707 TII->get(X86::FLDCW16m)), CWFrameIdx); 10708 10709 MI->eraseFromParent(); // The pseudo instruction is gone now. 10710 return BB; 10711 } 10712 // String/text processing lowering. 10713 case X86::PCMPISTRM128REG: 10714 case X86::VPCMPISTRM128REG: 10715 return EmitPCMP(MI, BB, 3, false /* in-mem */); 10716 case X86::PCMPISTRM128MEM: 10717 case X86::VPCMPISTRM128MEM: 10718 return EmitPCMP(MI, BB, 3, true /* in-mem */); 10719 case X86::PCMPESTRM128REG: 10720 case X86::VPCMPESTRM128REG: 10721 return EmitPCMP(MI, BB, 5, false /* in mem */); 10722 case X86::PCMPESTRM128MEM: 10723 case X86::VPCMPESTRM128MEM: 10724 return EmitPCMP(MI, BB, 5, true /* in mem */); 10725 10726 // Thread synchronization. 10727 case X86::MONITOR: 10728 return EmitMonitor(MI, BB); 10729 case X86::MWAIT: 10730 return EmitMwait(MI, BB); 10731 10732 // Atomic Lowering. 
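  // Each ATOM* pseudo below is expanded by its custom inserter into a
  // compare-exchange loop; as a simplified sketch for the 32-bit AND case:
  //   loop:
  //     movl  (addr), %eax
  //     movl  %eax, %tmp
  //     andl  %val, %tmp          ; the NAND flavor also applies notl %tmp
  //     lock cmpxchgl %tmp, (addr)
  //     jne   loop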
10733 case X86::ATOMAND32: 10734 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 10735 X86::AND32ri, X86::MOV32rm, 10736 X86::LCMPXCHG32, 10737 X86::NOT32r, X86::EAX, 10738 X86::GR32RegisterClass); 10739 case X86::ATOMOR32: 10740 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 10741 X86::OR32ri, X86::MOV32rm, 10742 X86::LCMPXCHG32, 10743 X86::NOT32r, X86::EAX, 10744 X86::GR32RegisterClass); 10745 case X86::ATOMXOR32: 10746 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 10747 X86::XOR32ri, X86::MOV32rm, 10748 X86::LCMPXCHG32, 10749 X86::NOT32r, X86::EAX, 10750 X86::GR32RegisterClass); 10751 case X86::ATOMNAND32: 10752 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 10753 X86::AND32ri, X86::MOV32rm, 10754 X86::LCMPXCHG32, 10755 X86::NOT32r, X86::EAX, 10756 X86::GR32RegisterClass, true); 10757 case X86::ATOMMIN32: 10758 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 10759 case X86::ATOMMAX32: 10760 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 10761 case X86::ATOMUMIN32: 10762 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 10763 case X86::ATOMUMAX32: 10764 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 10765 10766 case X86::ATOMAND16: 10767 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 10768 X86::AND16ri, X86::MOV16rm, 10769 X86::LCMPXCHG16, 10770 X86::NOT16r, X86::AX, 10771 X86::GR16RegisterClass); 10772 case X86::ATOMOR16: 10773 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 10774 X86::OR16ri, X86::MOV16rm, 10775 X86::LCMPXCHG16, 10776 X86::NOT16r, X86::AX, 10777 X86::GR16RegisterClass); 10778 case X86::ATOMXOR16: 10779 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 10780 X86::XOR16ri, X86::MOV16rm, 10781 X86::LCMPXCHG16, 10782 X86::NOT16r, X86::AX, 10783 X86::GR16RegisterClass); 10784 case X86::ATOMNAND16: 10785 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 10786 X86::AND16ri, X86::MOV16rm, 10787 X86::LCMPXCHG16, 10788 X86::NOT16r, X86::AX, 10789 X86::GR16RegisterClass, true); 10790 case X86::ATOMMIN16: 10791 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 10792 case X86::ATOMMAX16: 10793 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 10794 case X86::ATOMUMIN16: 10795 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 10796 case X86::ATOMUMAX16: 10797 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 10798 10799 case X86::ATOMAND8: 10800 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 10801 X86::AND8ri, X86::MOV8rm, 10802 X86::LCMPXCHG8, 10803 X86::NOT8r, X86::AL, 10804 X86::GR8RegisterClass); 10805 case X86::ATOMOR8: 10806 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 10807 X86::OR8ri, X86::MOV8rm, 10808 X86::LCMPXCHG8, 10809 X86::NOT8r, X86::AL, 10810 X86::GR8RegisterClass); 10811 case X86::ATOMXOR8: 10812 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 10813 X86::XOR8ri, X86::MOV8rm, 10814 X86::LCMPXCHG8, 10815 X86::NOT8r, X86::AL, 10816 X86::GR8RegisterClass); 10817 case X86::ATOMNAND8: 10818 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 10819 X86::AND8ri, X86::MOV8rm, 10820 X86::LCMPXCHG8, 10821 X86::NOT8r, X86::AL, 10822 X86::GR8RegisterClass, true); 10823 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 10824 // This group is for 64-bit host. 
10825 case X86::ATOMAND64: 10826 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 10827 X86::AND64ri32, X86::MOV64rm, 10828 X86::LCMPXCHG64, 10829 X86::NOT64r, X86::RAX, 10830 X86::GR64RegisterClass); 10831 case X86::ATOMOR64: 10832 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 10833 X86::OR64ri32, X86::MOV64rm, 10834 X86::LCMPXCHG64, 10835 X86::NOT64r, X86::RAX, 10836 X86::GR64RegisterClass); 10837 case X86::ATOMXOR64: 10838 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 10839 X86::XOR64ri32, X86::MOV64rm, 10840 X86::LCMPXCHG64, 10841 X86::NOT64r, X86::RAX, 10842 X86::GR64RegisterClass); 10843 case X86::ATOMNAND64: 10844 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 10845 X86::AND64ri32, X86::MOV64rm, 10846 X86::LCMPXCHG64, 10847 X86::NOT64r, X86::RAX, 10848 X86::GR64RegisterClass, true); 10849 case X86::ATOMMIN64: 10850 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 10851 case X86::ATOMMAX64: 10852 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 10853 case X86::ATOMUMIN64: 10854 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 10855 case X86::ATOMUMAX64: 10856 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 10857 10858 // This group does 64-bit operations on a 32-bit host. 10859 case X86::ATOMAND6432: 10860 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10861 X86::AND32rr, X86::AND32rr, 10862 X86::AND32ri, X86::AND32ri, 10863 false); 10864 case X86::ATOMOR6432: 10865 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10866 X86::OR32rr, X86::OR32rr, 10867 X86::OR32ri, X86::OR32ri, 10868 false); 10869 case X86::ATOMXOR6432: 10870 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10871 X86::XOR32rr, X86::XOR32rr, 10872 X86::XOR32ri, X86::XOR32ri, 10873 false); 10874 case X86::ATOMNAND6432: 10875 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10876 X86::AND32rr, X86::AND32rr, 10877 X86::AND32ri, X86::AND32ri, 10878 true); 10879 case X86::ATOMADD6432: 10880 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10881 X86::ADD32rr, X86::ADC32rr, 10882 X86::ADD32ri, X86::ADC32ri, 10883 false); 10884 case X86::ATOMSUB6432: 10885 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10886 X86::SUB32rr, X86::SBB32rr, 10887 X86::SUB32ri, X86::SBB32ri, 10888 false); 10889 case X86::ATOMSWAP6432: 10890 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10891 X86::MOV32rr, X86::MOV32rr, 10892 X86::MOV32ri, X86::MOV32ri, 10893 false); 10894 case X86::VASTART_SAVE_XMM_REGS: 10895 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 10896 10897 case X86::VAARG_64: 10898 return EmitVAARG64WithCustomInserter(MI, BB); 10899 } 10900} 10901 10902//===----------------------------------------------------------------------===// 10903// X86 Optimization Hooks 10904//===----------------------------------------------------------------------===// 10905 10906void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 10907 const APInt &Mask, 10908 APInt &KnownZero, 10909 APInt &KnownOne, 10910 const SelectionDAG &DAG, 10911 unsigned Depth) const { 10912 unsigned Opc = Op.getOpcode(); 10913 assert((Opc >= ISD::BUILTIN_OP_END || 10914 Opc == ISD::INTRINSIC_WO_CHAIN || 10915 Opc == ISD::INTRINSIC_W_CHAIN || 10916 Opc == ISD::INTRINSIC_VOID) && 10917 "Should use MaskedValueIsZero if you don't know whether Op" 10918 " is a target node!"); 10919 10920 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
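  // As an example of what the switch below records: X86ISD::SETCC produces
  // 0 or 1 in an i8, so all bits above the low bit are known zero; the same
  // holds for the boolean (second) result of the arithmetic nodes listed.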
10921 switch (Opc) { 10922 default: break; 10923 case X86ISD::ADD: 10924 case X86ISD::SUB: 10925 case X86ISD::ADC: 10926 case X86ISD::SBB: 10927 case X86ISD::SMUL: 10928 case X86ISD::UMUL: 10929 case X86ISD::INC: 10930 case X86ISD::DEC: 10931 case X86ISD::OR: 10932 case X86ISD::XOR: 10933 case X86ISD::AND: 10934 // These nodes' second result is a boolean. 10935 if (Op.getResNo() == 0) 10936 break; 10937 // Fallthrough 10938 case X86ISD::SETCC: 10939 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 10940 Mask.getBitWidth() - 1); 10941 break; 10942 } 10943} 10944 10945unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 10946 unsigned Depth) const { 10947 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 10948 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 10949 return Op.getValueType().getScalarType().getSizeInBits(); 10950 10951 // Fallback case. 10952 return 1; 10953} 10954 10955/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 10956/// node is a GlobalAddress + offset. 10957bool X86TargetLowering::isGAPlusOffset(SDNode *N, 10958 const GlobalValue* &GA, 10959 int64_t &Offset) const { 10960 if (N->getOpcode() == X86ISD::Wrapper) { 10961 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 10962 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 10963 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 10964 return true; 10965 } 10966 } 10967 return TargetLowering::isGAPlusOffset(N, GA, Offset); 10968} 10969 10970/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 10971/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 10972/// if the load addresses are consecutive, non-overlapping, and in the right 10973/// order. 10974static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 10975 TargetLowering::DAGCombinerInfo &DCI) { 10976 DebugLoc dl = N->getDebugLoc(); 10977 EVT VT = N->getValueType(0); 10978 10979 if (VT.getSizeInBits() != 128) 10980 return SDValue(); 10981 10982 // Don't create instructions with illegal types after legalize types has run. 10983 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10984 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 10985 return SDValue(); 10986 10987 SmallVector<SDValue, 16> Elts; 10988 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 10989 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); 10990 10991 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 10992} 10993 10994/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 10995/// generation and convert it from being a bunch of shuffles and extracts 10996/// to a simple store and scalar loads to extract the elements. 10997static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 10998 const TargetLowering &TLI) { 10999 SDValue InputVector = N->getOperand(0); 11000 11001 // Only operate on vectors of 4 elements, where the alternative shuffling 11002 // gets to be more expensive. 11003 if (InputVector.getValueType() != MVT::v4i32) 11004 return SDValue(); 11005 11006 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 11007 // single use which is a sign-extend or zero-extend, and all elements are 11008 // used. 
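  // Illustrative IR-level sketch of the pattern being matched:
  //   %e = extractelement <4 x i32> %v, i32 N
  //   %x = sext/zext %e               ; for N = 0..3
  // When all four lanes are consumed this way, the code below replaces the
  // extracts with one stack store of %v and four scalar loads.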
11009  SmallVector<SDNode *, 4> Uses;
11010  unsigned ExtractedElements = 0;
11011  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
11012       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
11013    if (UI.getUse().getResNo() != InputVector.getResNo())
11014      return SDValue();
11015
11016    SDNode *Extract = *UI;
11017    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11018      return SDValue();
11019
11020    if (Extract->getValueType(0) != MVT::i32)
11021      return SDValue();
11022    if (!Extract->hasOneUse())
11023      return SDValue();
11024    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
11025        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
11026      return SDValue();
11027    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
11028      return SDValue();
11029
11030    // Record which element was extracted.
11031    ExtractedElements |=
11032      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
11033
11034    Uses.push_back(Extract);
11035  }
11036
11037  // If not all the elements were used, this may not be worthwhile.
11038  if (ExtractedElements != 15)
11039    return SDValue();
11040
11041  // Ok, we've now decided to do the transformation.
11042  DebugLoc dl = InputVector.getDebugLoc();
11043
11044  // Store the value to a temporary stack slot.
11045  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
11046  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
11047                            MachinePointerInfo(), false, false, 0);
11048
11049  // Replace each use (extract) with a load of the appropriate element.
11050  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
11051       UE = Uses.end(); UI != UE; ++UI) {
11052    SDNode *Extract = *UI;
11053
11054    // Compute the element's address.
11055    SDValue Idx = Extract->getOperand(1);
11056    unsigned EltSize =
11057        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
11058    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
11059    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
11060
11061    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
11062                                     StackPtr, OffsetVal);
11063
11064    // Load the scalar.
11065    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
11066                                     ScalarAddr, MachinePointerInfo(),
11067                                     false, false, 0);
11068
11069    // Replace the extract with the load.
11070    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
11071  }
11072
11073  // The replacement was made in place; don't return anything.
11074  return SDValue();
11075}
11076
11077/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
11078static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
11079                                    const X86Subtarget *Subtarget) {
11080  DebugLoc DL = N->getDebugLoc();
11081  SDValue Cond = N->getOperand(0);
11082  // Get the LHS/RHS of the select.
11083  SDValue LHS = N->getOperand(1);
11084  SDValue RHS = N->getOperand(2);
11085
11086  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
11087  // instructions match the semantics of the common C idiom x<y?x:y but not
11088  // x<=y?x:y, because of how they handle negative zero (which can be
11089  // ignored in unsafe-math mode).
11090  if (Subtarget->hasSSE2() &&
11091      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
11092      Cond.getOpcode() == ISD::SETCC) {
11093    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
11094
11095    unsigned Opcode = 0;
11096    // Check for x CC y ? x : y.
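    // For example, (x setolt y) ? x : y becomes X86ISD::FMIN(x, y), which
    // matches minss/minsd exactly; the cases below bail out or swap the
    // operands whenever NaN or -0.0 behavior would otherwise differ.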
11097    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
11098        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
11099      switch (CC) {
11100      default: break;
11101      case ISD::SETULT:
11102        // Converting this to a min would handle NaNs incorrectly, and swapping
11103        // the operands would cause it to handle comparisons between positive
11104        // and negative zero incorrectly.
11105        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
11106          if (!UnsafeFPMath &&
11107              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
11108            break;
11109          std::swap(LHS, RHS);
11110        }
11111        Opcode = X86ISD::FMIN;
11112        break;
11113      case ISD::SETOLE:
11114        // Converting this to a min would handle comparisons between positive
11115        // and negative zero incorrectly.
11116        if (!UnsafeFPMath &&
11117            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
11118          break;
11119        Opcode = X86ISD::FMIN;
11120        break;
11121      case ISD::SETULE:
11122        // Converting this to a min would handle both negative zeros and NaNs
11123        // incorrectly, but we can swap the operands to fix both.
11124        std::swap(LHS, RHS);
11125      case ISD::SETOLT:
11126      case ISD::SETLT:
11127      case ISD::SETLE:
11128        Opcode = X86ISD::FMIN;
11129        break;
11130
11131      case ISD::SETOGE:
11132        // Converting this to a max would handle comparisons between positive
11133        // and negative zero incorrectly.
11134        if (!UnsafeFPMath &&
11135            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
11136          break;
11137        Opcode = X86ISD::FMAX;
11138        break;
11139      case ISD::SETUGT:
11140        // Converting this to a max would handle NaNs incorrectly, and swapping
11141        // the operands would cause it to handle comparisons between positive
11142        // and negative zero incorrectly.
11143        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
11144          if (!UnsafeFPMath &&
11145              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
11146            break;
11147          std::swap(LHS, RHS);
11148        }
11149        Opcode = X86ISD::FMAX;
11150        break;
11151      case ISD::SETUGE:
11152        // Converting this to a max would handle both negative zeros and NaNs
11153        // incorrectly, but we can swap the operands to fix both.
11154        std::swap(LHS, RHS);
11155      case ISD::SETOGT:
11156      case ISD::SETGT:
11157      case ISD::SETGE:
11158        Opcode = X86ISD::FMAX;
11159        break;
11160      }
11161    // Check for x CC y ? y : x -- a min/max with reversed arms.
11162    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
11163               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
11164      switch (CC) {
11165      default: break;
11166      case ISD::SETOGE:
11167        // Converting this to a min would handle comparisons between positive
11168        // and negative zero incorrectly, and swapping the operands would
11169        // cause it to handle NaNs incorrectly.
11170        if (!UnsafeFPMath &&
11171            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
11172          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
11173            break;
11174          std::swap(LHS, RHS);
11175        }
11176        Opcode = X86ISD::FMIN;
11177        break;
11178      case ISD::SETUGT:
11179        // Converting this to a min would handle NaNs incorrectly.
11180        if (!UnsafeFPMath &&
11181            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
11182          break;
11183        Opcode = X86ISD::FMIN;
11184        break;
11185      case ISD::SETUGE:
11186        // Converting this to a min would handle both negative zeros and NaNs
11187        // incorrectly, but we can swap the operands to fix both.
11188        std::swap(LHS, RHS);
11189      case ISD::SETOGT:
11190      case ISD::SETGT:
11191      case ISD::SETGE:
11192        Opcode = X86ISD::FMIN;
11193        break;
11194
11195      case ISD::SETULT:
11196        // Converting this to a max would handle NaNs incorrectly.
11197        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
11198          break;
11199        Opcode = X86ISD::FMAX;
11200        break;
11201      case ISD::SETOLE:
11202        // Converting this to a max would handle comparisons between positive
11203        // and negative zero incorrectly, and swapping the operands would
11204        // cause it to handle NaNs incorrectly.
11205        if (!UnsafeFPMath &&
11206            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
11207          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
11208            break;
11209          std::swap(LHS, RHS);
11210        }
11211        Opcode = X86ISD::FMAX;
11212        break;
11213      case ISD::SETULE:
11214        // Converting this to a max would handle both negative zeros and NaNs
11215        // incorrectly, but we can swap the operands to fix both.
11216        std::swap(LHS, RHS);
11217      case ISD::SETOLT:
11218      case ISD::SETLT:
11219      case ISD::SETLE:
11220        Opcode = X86ISD::FMAX;
11221        break;
11222      }
11223    }
11224
11225    if (Opcode)
11226      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
11227  }
11228
11229  // If this is a select between two integer constants, try to do some
11230  // optimizations.
11231  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
11232    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
11233      // Don't do this for crazy integer types.
11234      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
11235        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
11236        // so that TrueC (the true value) is larger than FalseC.
11237        bool NeedsCondInvert = false;
11238
11239        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
11240            // Efficiently invertible.
11241            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
11242             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
11243              isa<ConstantSDNode>(Cond.getOperand(1))))) {
11244          NeedsCondInvert = true;
11245          std::swap(TrueC, FalseC);
11246        }
11247
11248        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
11249        if (FalseC->getAPIntValue() == 0 &&
11250            TrueC->getAPIntValue().isPowerOf2()) {
11251          if (NeedsCondInvert) // Invert the condition if needed.
11252            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
11253                               DAG.getConstant(1, Cond.getValueType()));
11254
11255          // Zero extend the condition if needed.
11256          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
11257
11258          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
11259          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
11260                             DAG.getConstant(ShAmt, MVT::i8));
11261        }
11262
11263        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
11264        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
11265          if (NeedsCondInvert) // Invert the condition if needed.
11266            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
11267                               DAG.getConstant(1, Cond.getValueType()));
11268
11269          // Zero extend the condition if needed.
11270          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
11271                             FalseC->getValueType(0), Cond);
11272          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
11273                             SDValue(FalseC, 0));
11274        }
11275
11276        // Optimize cases that will turn into an LEA instruction.  This requires
11277        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
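        // Worked example: C ? 13 : 8 has Diff == 5, so the result is formed
        // as 8 + cond*5, which the address-mode matcher can later fold into
        // lea 8(%cond,%cond,4).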
11278 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 11279 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 11280 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 11281 11282 bool isFastMultiplier = false; 11283 if (Diff < 10) { 11284 switch ((unsigned char)Diff) { 11285 default: break; 11286 case 1: // result = add base, cond 11287 case 2: // result = lea base( , cond*2) 11288 case 3: // result = lea base(cond, cond*2) 11289 case 4: // result = lea base( , cond*4) 11290 case 5: // result = lea base(cond, cond*4) 11291 case 8: // result = lea base( , cond*8) 11292 case 9: // result = lea base(cond, cond*8) 11293 isFastMultiplier = true; 11294 break; 11295 } 11296 } 11297 11298 if (isFastMultiplier) { 11299 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 11300 if (NeedsCondInvert) // Invert the condition if needed. 11301 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 11302 DAG.getConstant(1, Cond.getValueType())); 11303 11304 // Zero extend the condition if needed. 11305 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 11306 Cond); 11307 // Scale the condition by the difference. 11308 if (Diff != 1) 11309 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 11310 DAG.getConstant(Diff, Cond.getValueType())); 11311 11312 // Add the base if non-zero. 11313 if (FalseC->getAPIntValue() != 0) 11314 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 11315 SDValue(FalseC, 0)); 11316 return Cond; 11317 } 11318 } 11319 } 11320 } 11321 11322 return SDValue(); 11323} 11324 11325/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 11326static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 11327 TargetLowering::DAGCombinerInfo &DCI) { 11328 DebugLoc DL = N->getDebugLoc(); 11329 11330 // If the flag operand isn't dead, don't touch this CMOV. 11331 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 11332 return SDValue(); 11333 11334 // If this is a select between two integer constants, try to do some 11335 // optimizations. Note that the operands are ordered the opposite of SELECT 11336 // operands. 11337 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 11338 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 11339 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 11340 // larger than FalseC (the false value). 11341 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 11342 11343 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 11344 CC = X86::GetOppositeBranchCondition(CC); 11345 std::swap(TrueC, FalseC); 11346 } 11347 11348 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 11349 // This is efficient for any integer data type (including i8/i16) and 11350 // shift amount. 11351 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 11352 SDValue Cond = N->getOperand(3); 11353 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 11354 DAG.getConstant(CC, MVT::i8), Cond); 11355 11356 // Zero extend the condition if needed. 11357 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 11358 11359 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 11360 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 11361 DAG.getConstant(ShAmt, MVT::i8)); 11362 if (N->getNumValues() == 2) // Dead flag value? 11363 return DCI.CombineTo(N, Cond, SDValue()); 11364 return Cond; 11365 } 11366 11367 // Optimize Cond ? 
cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
11368      // for any integer data type, including i8/i16.
11369      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
11370        SDValue Cond = N->getOperand(3);
11371        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
11372                           DAG.getConstant(CC, MVT::i8), Cond);
11373
11374        // Zero extend the condition if needed.
11375        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
11376                           FalseC->getValueType(0), Cond);
11377        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
11378                           SDValue(FalseC, 0));
11379
11380        if (N->getNumValues() == 2)  // Dead flag value?
11381          return DCI.CombineTo(N, Cond, SDValue());
11382        return Cond;
11383      }
11384
11385      // Optimize cases that will turn into an LEA instruction.  This requires
11386      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
11387      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
11388        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
11389        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
11390
11391        bool isFastMultiplier = false;
11392        if (Diff < 10) {
11393          switch ((unsigned char)Diff) {
11394          default: break;
11395          case 1:  // result = add base, cond
11396          case 2:  // result = lea base(    , cond*2)
11397          case 3:  // result = lea base(cond, cond*2)
11398          case 4:  // result = lea base(    , cond*4)
11399          case 5:  // result = lea base(cond, cond*4)
11400          case 8:  // result = lea base(    , cond*8)
11401          case 9:  // result = lea base(cond, cond*8)
11402            isFastMultiplier = true;
11403            break;
11404          }
11405        }
11406
11407        if (isFastMultiplier) {
11408          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
11409          SDValue Cond = N->getOperand(3);
11410          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
11411                             DAG.getConstant(CC, MVT::i8), Cond);
11412          // Zero extend the condition if needed.
11413          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
11414                             Cond);
11415          // Scale the condition by the difference.
11416          if (Diff != 1)
11417            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
11418                               DAG.getConstant(Diff, Cond.getValueType()));
11419
11420          // Add the base if non-zero.
11421          if (FalseC->getAPIntValue() != 0)
11422            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
11423                               SDValue(FalseC, 0));
11424          if (N->getNumValues() == 2)  // Dead flag value?
11425            return DCI.CombineTo(N, Cond, SDValue());
11426          return Cond;
11427        }
11428      }
11429    }
11430  }
11431  return SDValue();
11432}
11433
11434
11435/// PerformMulCombine - Optimize a single multiply with constant into two
11436/// in order to implement it with two cheaper instructions, e.g.
11437/// LEA + SHL, LEA + LEA.
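/// For example (sketch): a multiply by 45 splits into (x*9)*5, i.e. two
/// LEAs, and a multiply by 40 splits into (x*5)<<3, i.e. an LEA plus a
/// SHL, subject to the ordering heuristic below.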
11438static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
11439                                 TargetLowering::DAGCombinerInfo &DCI) {
11440  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
11441    return SDValue();
11442
11443  EVT VT = N->getValueType(0);
11444  if (VT != MVT::i64)
11445    return SDValue();
11446
11447  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
11448  if (!C)
11449    return SDValue();
11450  uint64_t MulAmt = C->getZExtValue();
11451  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
11452    return SDValue();
11453
11454  uint64_t MulAmt1 = 0;
11455  uint64_t MulAmt2 = 0;
11456  if ((MulAmt % 9) == 0) {
11457    MulAmt1 = 9;
11458    MulAmt2 = MulAmt / 9;
11459  } else if ((MulAmt % 5) == 0) {
11460    MulAmt1 = 5;
11461    MulAmt2 = MulAmt / 5;
11462  } else if ((MulAmt % 3) == 0) {
11463    MulAmt1 = 3;
11464    MulAmt2 = MulAmt / 3;
11465  }
11466  if (MulAmt2 &&
11467      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
11468    DebugLoc DL = N->getDebugLoc();
11469
11470    if (isPowerOf2_64(MulAmt2) &&
11471        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
11472      // If second multiplier is pow2, issue it first. We want the multiply by
11473      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
11474      // is an add.
11475      std::swap(MulAmt1, MulAmt2);
11476
11477    SDValue NewMul;
11478    if (isPowerOf2_64(MulAmt1))
11479      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
11480                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
11481    else
11482      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
11483                           DAG.getConstant(MulAmt1, VT));
11484
11485    if (isPowerOf2_64(MulAmt2))
11486      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
11487                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
11488    else
11489      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
11490                           DAG.getConstant(MulAmt2, VT));
11491
11492    // Do not add new nodes to DAG combiner worklist.
11493    DCI.CombineTo(N, NewMul, false);
11494  }
11495  return SDValue();
11496}
11497
11498static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
11499  SDValue N0 = N->getOperand(0);
11500  SDValue N1 = N->getOperand(1);
11501  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
11502  EVT VT = N0.getValueType();
11503
11504  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
11505  // since the result of setcc_c is all zeros or all ones.
11506  if (N1C && N0.getOpcode() == ISD::AND &&
11507      N0.getOperand(1).getOpcode() == ISD::Constant) {
11508    SDValue N00 = N0.getOperand(0);
11509    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
11510        ((N00.getOpcode() == ISD::ANY_EXTEND ||
11511          N00.getOpcode() == ISD::ZERO_EXTEND) &&
11512         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
11513      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
11514      APInt ShAmt = N1C->getAPIntValue();
11515      Mask = Mask.shl(ShAmt);
11516      if (Mask != 0)
11517        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
11518                           N00, DAG.getConstant(Mask, VT));
11519    }
11520  }
11521
11522  return SDValue();
11523}
11524
11525/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
11526/// when possible.
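/// For example, (shl <4 x i32> %x, <splat 5>) becomes the
/// x86_sse2_pslli_d intrinsic node, i.e. a single pslld $5, once the
/// splatted shift amount has been extracted below.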
11527static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
11528                                   const X86Subtarget *Subtarget) {
11529  EVT VT = N->getValueType(0);
11530  if (!VT.isVector() && VT.isInteger() &&
11531      N->getOpcode() == ISD::SHL)
11532    return PerformSHLCombine(N, DAG);
11533
11534  // On X86 with SSE2 support, we can transform this to a vector shift if
11535  // all elements are shifted by the same amount.  We can't do this in legalize
11536  // because a constant vector is typically transformed to a constant pool
11537  // so we have no knowledge of the shift amount.
11538  if (!Subtarget->hasSSE2())
11539    return SDValue();
11540
11541  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
11542    return SDValue();
11543
11544  SDValue ShAmtOp = N->getOperand(1);
11545  EVT EltVT = VT.getVectorElementType();
11546  DebugLoc DL = N->getDebugLoc();
11547  SDValue BaseShAmt = SDValue();
11548  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
11549    unsigned NumElts = VT.getVectorNumElements();
11550    unsigned i = 0;
11551    for (; i != NumElts; ++i) {
11552      SDValue Arg = ShAmtOp.getOperand(i);
11553      if (Arg.getOpcode() == ISD::UNDEF) continue;
11554      BaseShAmt = Arg;
11555      break;
11556    }
11557    for (; i != NumElts; ++i) {
11558      SDValue Arg = ShAmtOp.getOperand(i);
11559      if (Arg.getOpcode() == ISD::UNDEF) continue;
11560      if (Arg != BaseShAmt) {
11561        return SDValue();
11562      }
11563    }
11564  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
11565             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
11566    SDValue InVec = ShAmtOp.getOperand(0);
11567    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
11568      unsigned NumElts = InVec.getValueType().getVectorNumElements();
11569      unsigned i = 0;
11570      for (; i != NumElts; ++i) {
11571        SDValue Arg = InVec.getOperand(i);
11572        if (Arg.getOpcode() == ISD::UNDEF) continue;
11573        BaseShAmt = Arg;
11574        break;
11575      }
11576    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
11577      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
11578        unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
11579        if (C->getZExtValue() == SplatIdx)
11580          BaseShAmt = InVec.getOperand(1);
11581      }
11582    }
11583    if (BaseShAmt.getNode() == 0)
11584      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
11585                              DAG.getIntPtrConstant(0));
11586  } else
11587    return SDValue();
11588
11589  // The shift amount is an i32.
11590  if (EltVT.bitsGT(MVT::i32))
11591    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
11592  else if (EltVT.bitsLT(MVT::i32))
11593    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
11594
11595  // The shift amount is identical so we can do a vector shift.
11596 SDValue ValOp = N->getOperand(0); 11597 switch (N->getOpcode()) { 11598 default: 11599 llvm_unreachable("Unknown shift opcode!"); 11600 break; 11601 case ISD::SHL: 11602 if (VT == MVT::v2i64) 11603 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11604 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 11605 ValOp, BaseShAmt); 11606 if (VT == MVT::v4i32) 11607 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11608 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 11609 ValOp, BaseShAmt); 11610 if (VT == MVT::v8i16) 11611 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11612 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 11613 ValOp, BaseShAmt); 11614 break; 11615 case ISD::SRA: 11616 if (VT == MVT::v4i32) 11617 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11618 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 11619 ValOp, BaseShAmt); 11620 if (VT == MVT::v8i16) 11621 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11622 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 11623 ValOp, BaseShAmt); 11624 break; 11625 case ISD::SRL: 11626 if (VT == MVT::v2i64) 11627 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11628 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 11629 ValOp, BaseShAmt); 11630 if (VT == MVT::v4i32) 11631 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11632 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 11633 ValOp, BaseShAmt); 11634 if (VT == MVT::v8i16) 11635 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11636 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 11637 ValOp, BaseShAmt); 11638 break; 11639 } 11640 return SDValue(); 11641} 11642 11643 11644static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, 11645 TargetLowering::DAGCombinerInfo &DCI, 11646 const X86Subtarget *Subtarget) { 11647 if (DCI.isBeforeLegalizeOps()) 11648 return SDValue(); 11649 11650 // Want to form PANDN nodes, in the hopes of then easily combining them with 11651 // OR and AND nodes to form PBLEND/PSIGN. 
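  // Recall that PANDN computes (~LHS) & RHS in a single instruction, so an
  // explicit vnot feeding either side of the AND collapses into it.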
11652  EVT VT = N->getValueType(0);
11653  if (VT != MVT::v2i64)
11654    return SDValue();
11655
11656  SDValue N0 = N->getOperand(0);
11657  SDValue N1 = N->getOperand(1);
11658  DebugLoc DL = N->getDebugLoc();
11659
11660  // Check LHS for vnot
11661  if (N0.getOpcode() == ISD::XOR &&
11662      ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
11663    return DAG.getNode(X86ISD::PANDN, DL, VT, N0.getOperand(0), N1);
11664
11665  // Check RHS for vnot
11666  if (N1.getOpcode() == ISD::XOR &&
11667      ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
11668    return DAG.getNode(X86ISD::PANDN, DL, VT, N1.getOperand(0), N0);
11669
11670  return SDValue();
11671}
11672
11673static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
11674                                TargetLowering::DAGCombinerInfo &DCI,
11675                                const X86Subtarget *Subtarget) {
11676  if (DCI.isBeforeLegalizeOps())
11677    return SDValue();
11678
11679  EVT VT = N->getValueType(0);
11680  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64)
11681    return SDValue();
11682
11683  SDValue N0 = N->getOperand(0);
11684  SDValue N1 = N->getOperand(1);
11685
11686  // look for psign/blend
11687  if (Subtarget->hasSSSE3()) {
11688    if (VT == MVT::v2i64) {
11689      // Canonicalize pandn to RHS
11690      if (N0.getOpcode() == X86ISD::PANDN)
11691        std::swap(N0, N1);
11692      // or (and (m, x), (pandn m, y))
11693      if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::PANDN) {
11694        SDValue Mask = N1.getOperand(0);
11695        SDValue X = N1.getOperand(1);
11696        SDValue Y;
11697        if (N0.getOperand(0) == Mask)
11698          Y = N0.getOperand(1);
11699        if (N0.getOperand(1) == Mask)
11700          Y = N0.getOperand(0);
11701
11702        // Check to see if the mask appeared in both the AND and PANDN.
11703        if (!Y.getNode())
11704          return SDValue();
11705
11706        // Validate that X, Y, and Mask are BITCASTs, and see through them.
11707        if (Mask.getOpcode() != ISD::BITCAST ||
11708            X.getOpcode() != ISD::BITCAST ||
11709            Y.getOpcode() != ISD::BITCAST)
11710          return SDValue();
11711
11712        // Look through mask bitcast.
11713        Mask = Mask.getOperand(0);
11714        EVT MaskVT = Mask.getValueType();
11715
11716        // Validate that the Mask operand is a vector sra node.  The sra node
11717        // will be an intrinsic.
11718        if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
11719          return SDValue();
11720
11721        // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
11722        // there is no psrai.b
11723        switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) {
11724        case Intrinsic::x86_sse2_psrai_w:
11725        case Intrinsic::x86_sse2_psrai_d:
11726          break;
11727        default: return SDValue();
11728        }
11729
11730        // Check that the SRA is all signbits.
11731        SDValue SraC = Mask.getOperand(2);
11732        unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
11733        unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
11734        if ((SraAmt + 1) != EltBits)
11735          return SDValue();
11736
11737        DebugLoc DL = N->getDebugLoc();
11738
11739        // Now we know we at least have a pblendvb with the mask val.  See if
11740        // we can form a psignb/w/d.
11741 // psign = x.type == y.type == mask.type && y = sub(0, x); 11742 X = X.getOperand(0); 11743 Y = Y.getOperand(0); 11744 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && 11745 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && 11746 X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){ 11747 unsigned Opc = 0; 11748 switch (EltBits) { 11749 case 8: Opc = X86ISD::PSIGNB; break; 11750 case 16: Opc = X86ISD::PSIGNW; break; 11751 case 32: Opc = X86ISD::PSIGND; break; 11752 default: break; 11753 } 11754 if (Opc) { 11755 SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1)); 11756 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign); 11757 } 11758 } 11759 // PBLENDVB only available on SSE 4.1 11760 if (!Subtarget->hasSSE41()) 11761 return SDValue(); 11762 11763 X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X); 11764 Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y); 11765 Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask); 11766 Mask = DAG.getNode(X86ISD::PBLENDVB, DL, MVT::v16i8, X, Y, Mask); 11767 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask); 11768 } 11769 } 11770 } 11771 11772 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 11773 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 11774 std::swap(N0, N1); 11775 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 11776 return SDValue(); 11777 if (!N0.hasOneUse() || !N1.hasOneUse()) 11778 return SDValue(); 11779 11780 SDValue ShAmt0 = N0.getOperand(1); 11781 if (ShAmt0.getValueType() != MVT::i8) 11782 return SDValue(); 11783 SDValue ShAmt1 = N1.getOperand(1); 11784 if (ShAmt1.getValueType() != MVT::i8) 11785 return SDValue(); 11786 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 11787 ShAmt0 = ShAmt0.getOperand(0); 11788 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 11789 ShAmt1 = ShAmt1.getOperand(0); 11790 11791 DebugLoc DL = N->getDebugLoc(); 11792 unsigned Opc = X86ISD::SHLD; 11793 SDValue Op0 = N0.getOperand(0); 11794 SDValue Op1 = N1.getOperand(0); 11795 if (ShAmt0.getOpcode() == ISD::SUB) { 11796 Opc = X86ISD::SHRD; 11797 std::swap(Op0, Op1); 11798 std::swap(ShAmt0, ShAmt1); 11799 } 11800 11801 unsigned Bits = VT.getSizeInBits(); 11802 if (ShAmt1.getOpcode() == ISD::SUB) { 11803 SDValue Sum = ShAmt1.getOperand(0); 11804 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 11805 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 11806 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 11807 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 11808 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 11809 return DAG.getNode(Opc, DL, VT, 11810 Op0, Op1, 11811 DAG.getNode(ISD::TRUNCATE, DL, 11812 MVT::i8, ShAmt0)); 11813 } 11814 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 11815 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 11816 if (ShAmt0C && 11817 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 11818 return DAG.getNode(Opc, DL, VT, 11819 N0.getOperand(0), N1.getOperand(0), 11820 DAG.getNode(ISD::TRUNCATE, DL, 11821 MVT::i8, ShAmt0)); 11822 } 11823 11824 return SDValue(); 11825} 11826 11827/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 11828static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 11829 const X86Subtarget *Subtarget) { 11830 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 11831 // the FP state in cases where an emms may be missing. 
11832 // A preferable solution to the general problem is to figure out the right 11833 // places to insert EMMS. This qualifies as a quick hack. 11834 11835 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 11836 StoreSDNode *St = cast<StoreSDNode>(N); 11837 EVT VT = St->getValue().getValueType(); 11838 if (VT.getSizeInBits() != 64) 11839 return SDValue(); 11840 11841 const Function *F = DAG.getMachineFunction().getFunction(); 11842 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 11843 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 11844 && Subtarget->hasSSE2(); 11845 if ((VT.isVector() || 11846 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 11847 isa<LoadSDNode>(St->getValue()) && 11848 !cast<LoadSDNode>(St->getValue())->isVolatile() && 11849 St->getChain().hasOneUse() && !St->isVolatile()) { 11850 SDNode* LdVal = St->getValue().getNode(); 11851 LoadSDNode *Ld = 0; 11852 int TokenFactorIndex = -1; 11853 SmallVector<SDValue, 8> Ops; 11854 SDNode* ChainVal = St->getChain().getNode(); 11855 // Must be a store of a load. We currently handle two cases: the load 11856 // is a direct child, and it's under an intervening TokenFactor. It is 11857 // possible to dig deeper under nested TokenFactors. 11858 if (ChainVal == LdVal) 11859 Ld = cast<LoadSDNode>(St->getChain()); 11860 else if (St->getValue().hasOneUse() && 11861 ChainVal->getOpcode() == ISD::TokenFactor) { 11862 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 11863 if (ChainVal->getOperand(i).getNode() == LdVal) { 11864 TokenFactorIndex = i; 11865 Ld = cast<LoadSDNode>(St->getValue()); 11866 } else 11867 Ops.push_back(ChainVal->getOperand(i)); 11868 } 11869 } 11870 11871 if (!Ld || !ISD::isNormalLoad(Ld)) 11872 return SDValue(); 11873 11874 // If this is not the MMX case, i.e. we are just turning i64 load/store 11875 // into f64 load/store, avoid the transformation if there are multiple 11876 // uses of the loaded value. 11877 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 11878 return SDValue(); 11879 11880 DebugLoc LdDL = Ld->getDebugLoc(); 11881 DebugLoc StDL = N->getDebugLoc(); 11882 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 11883 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 11884 // pair instead. 11885 if (Subtarget->is64Bit() || F64IsLegal) { 11886 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 11887 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 11888 Ld->getPointerInfo(), Ld->isVolatile(), 11889 Ld->isNonTemporal(), Ld->getAlignment()); 11890 SDValue NewChain = NewLd.getValue(1); 11891 if (TokenFactorIndex != -1) { 11892 Ops.push_back(NewChain); 11893 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 11894 Ops.size()); 11895 } 11896 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 11897 St->getPointerInfo(), 11898 St->isVolatile(), St->isNonTemporal(), 11899 St->getAlignment()); 11900 } 11901 11902 // Otherwise, lower to two pairs of 32-bit loads / stores. 
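    // Sketch of the result: an i64 load/store pair through pointers p and q
    // becomes
    //   lo = load i32 p; hi = load i32 p+4
    //   store lo, q;     store hi, q+4
    // with the high halves at offset 4 and correspondingly reduced alignment.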
11903 SDValue LoAddr = Ld->getBasePtr(); 11904 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 11905 DAG.getConstant(4, MVT::i32)); 11906 11907 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 11908 Ld->getPointerInfo(), 11909 Ld->isVolatile(), Ld->isNonTemporal(), 11910 Ld->getAlignment()); 11911 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 11912 Ld->getPointerInfo().getWithOffset(4), 11913 Ld->isVolatile(), Ld->isNonTemporal(), 11914 MinAlign(Ld->getAlignment(), 4)); 11915 11916 SDValue NewChain = LoLd.getValue(1); 11917 if (TokenFactorIndex != -1) { 11918 Ops.push_back(LoLd); 11919 Ops.push_back(HiLd); 11920 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 11921 Ops.size()); 11922 } 11923 11924 LoAddr = St->getBasePtr(); 11925 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 11926 DAG.getConstant(4, MVT::i32)); 11927 11928 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 11929 St->getPointerInfo(), 11930 St->isVolatile(), St->isNonTemporal(), 11931 St->getAlignment()); 11932 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 11933 St->getPointerInfo().getWithOffset(4), 11934 St->isVolatile(), 11935 St->isNonTemporal(), 11936 MinAlign(St->getAlignment(), 4)); 11937 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 11938 } 11939 return SDValue(); 11940} 11941 11942/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 11943/// X86ISD::FXOR nodes. 11944static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 11945 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 11946 // F[X]OR(0.0, x) -> x 11947 // F[X]OR(x, 0.0) -> x 11948 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 11949 if (C->getValueAPF().isPosZero()) 11950 return N->getOperand(1); 11951 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 11952 if (C->getValueAPF().isPosZero()) 11953 return N->getOperand(0); 11954 return SDValue(); 11955} 11956 11957/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 11958static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 11959 // FAND(0.0, x) -> 0.0 11960 // FAND(x, 0.0) -> 0.0 11961 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 11962 if (C->getValueAPF().isPosZero()) 11963 return N->getOperand(0); 11964 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 11965 if (C->getValueAPF().isPosZero()) 11966 return N->getOperand(1); 11967 return SDValue(); 11968} 11969 11970static SDValue PerformBTCombine(SDNode *N, 11971 SelectionDAG &DAG, 11972 TargetLowering::DAGCombinerInfo &DCI) { 11973 // BT ignores high bits in the bit index operand. 
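  // For example, a 32-bit BT only uses the low five bits of the index, so
  // the demanded-bits mask built below is Log2_32(BitWidth) bits wide and
  // any computation feeding only the upper index bits can be simplified away.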
11974 SDValue Op1 = N->getOperand(1); 11975 if (Op1.hasOneUse()) { 11976 unsigned BitWidth = Op1.getValueSizeInBits(); 11977 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 11978 APInt KnownZero, KnownOne; 11979 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 11980 !DCI.isBeforeLegalizeOps()); 11981 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11982 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 11983 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 11984 DCI.CommitTargetLoweringOpt(TLO); 11985 } 11986 return SDValue(); 11987} 11988 11989static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 11990 SDValue Op = N->getOperand(0); 11991 if (Op.getOpcode() == ISD::BITCAST) 11992 Op = Op.getOperand(0); 11993 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 11994 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 11995 VT.getVectorElementType().getSizeInBits() == 11996 OpVT.getVectorElementType().getSizeInBits()) { 11997 return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); 11998 } 11999 return SDValue(); 12000} 12001 12002static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 12003 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 12004 // (and (i32 x86isd::setcc_carry), 1) 12005 // This eliminates the zext. This transformation is necessary because 12006 // ISD::SETCC is always legalized to i8. 12007 DebugLoc dl = N->getDebugLoc(); 12008 SDValue N0 = N->getOperand(0); 12009 EVT VT = N->getValueType(0); 12010 if (N0.getOpcode() == ISD::AND && 12011 N0.hasOneUse() && 12012 N0.getOperand(0).hasOneUse()) { 12013 SDValue N00 = N0.getOperand(0); 12014 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 12015 return SDValue(); 12016 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 12017 if (!C || C->getZExtValue() != 1) 12018 return SDValue(); 12019 return DAG.getNode(ISD::AND, dl, VT, 12020 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 12021 N00.getOperand(0), N00.getOperand(1)), 12022 DAG.getConstant(1, VT)); 12023 } 12024 12025 return SDValue(); 12026} 12027 12028// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 12029static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { 12030 unsigned X86CC = N->getConstantOperandVal(0); 12031 SDValue EFLAG = N->getOperand(1); 12032 DebugLoc DL = N->getDebugLoc(); 12033 12034 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 12035 // a zext and produces an all-ones bit which is more useful than 0/1 in some 12036 // cases. 12037 if (X86CC == X86::COND_B) 12038 return DAG.getNode(ISD::AND, DL, MVT::i8, 12039 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 12040 DAG.getConstant(X86CC, MVT::i8), EFLAG), 12041 DAG.getConstant(1, MVT::i8)); 12042 12043 return SDValue(); 12044} 12045 12046// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS 12047static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, 12048 X86TargetLowering::DAGCombinerInfo &DCI) { 12049 // If the LHS and RHS of the ADC node are zero, then it can't overflow and 12050 // the result is either zero or one (depending on the input carry bit). 12051 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 12052 if (X86::isZeroNode(N->getOperand(0)) && 12053 X86::isZeroNode(N->getOperand(1)) && 12054 // We don't have a good way to replace an EFLAGS use, so only do this when 12055 // dead right now. 
12056 SDValue(N, 1).use_empty()) { 12057 DebugLoc DL = N->getDebugLoc(); 12058 EVT VT = N->getValueType(0); 12059 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); 12060 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, 12061 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, 12062 DAG.getConstant(X86::COND_B,MVT::i8), 12063 N->getOperand(2)), 12064 DAG.getConstant(1, VT)); 12065 return DCI.CombineTo(N, Res1, CarryOut); 12066 } 12067 12068 return SDValue(); 12069} 12070 12071// fold (add Y, (sete X, 0)) -> adc 0, Y 12072// (add Y, (setne X, 0)) -> sbb -1, Y 12073// (sub (sete X, 0), Y) -> sbb 0, Y 12074// (sub (setne X, 0), Y) -> adc -1, Y 12075static SDValue OptimizeConditonalInDecrement(SDNode *N, SelectionDAG &DAG) { 12076 DebugLoc DL = N->getDebugLoc(); 12077 12078 // Look through ZExts. 12079 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); 12080 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) 12081 return SDValue(); 12082 12083 SDValue SetCC = Ext.getOperand(0); 12084 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) 12085 return SDValue(); 12086 12087 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); 12088 if (CC != X86::COND_E && CC != X86::COND_NE) 12089 return SDValue(); 12090 12091 SDValue Cmp = SetCC.getOperand(1); 12092 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || 12093 !X86::isZeroNode(Cmp.getOperand(1)) || 12094 !Cmp.getOperand(0).getValueType().isInteger()) 12095 return SDValue(); 12096 12097 SDValue CmpOp0 = Cmp.getOperand(0); 12098 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, 12099 DAG.getConstant(1, CmpOp0.getValueType())); 12100 12101 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); 12102 if (CC == X86::COND_NE) 12103 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, 12104 DL, OtherVal.getValueType(), OtherVal, 12105 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); 12106 return DAG.getNode(N->getOpcode() == ISD::SUB ? 

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
  case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
  case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
  case ISD::ADD:
  case ISD::SUB: return OptimizeConditionalInDecrement(N, DAG);
  case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
  case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget);
  case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
  case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
  case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR: return PerformFORCombine(N, DAG);
  case X86ISD::FAND: return PerformFANDCombine(N, DAG);
  case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
  case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
  case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG);
  case X86ISD::SETCC: return PerformSETCCCombine(N, DAG);
  case X86ISD::SHUFPS: // Handle all target specific shuffles
  case X86ISD::SHUFPD:
  case X86ISD::PALIGN:
  case X86ISD::PUNPCKHBW:
  case X86ISD::PUNPCKHWD:
  case X86ISD::PUNPCKHDQ:
  case X86ISD::PUNPCKHQDQ:
  case X86ISD::UNPCKHPS:
  case X86ISD::UNPCKHPD:
  case X86ISD::PUNPCKLBW:
  case X86ISD::PUNPCKLWD:
  case X86ISD::PUNPCKLDQ:
  case X86ISD::PUNPCKLQDQ:
  case X86ISD::UNPCKLPS:
  case X86ISD::UNPCKLPD:
  case X86ISD::VUNPCKLPS:
  case X86ISD::VUNPCKLPD:
  case X86ISD::VUNPCKLPSY:
  case X86ISD::VUNPCKLPDY:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
  }

  return SDValue();
}

/// isTypeDesirableForOp - Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. on x86, i16 is legal but undesirable, since i16
/// instruction encodings are longer and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}

/// IsDesirableToPromoteOp - This method queries the target whether it is
/// beneficial for the DAG combiner to promote the specified node. If true, it
/// should return the desired promotion type by reference.
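/// (An illustrative sketch, not tied to a particular workload: promoting
///   %r = shl i16 %x, 3
/// to an i32 shift avoids the 0x66 operand-size prefix in the encoding; the
/// checks below decline only when promotion would defeat a profitable load
/// or store folding.)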
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(Op);
    // If the non-extending load has a single use and it's not live out, then
    // it might be folded.
    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
                                                     Op.hasOneUse()*/) {
      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
        // The only case where we'd want to promote a LOAD (rather than have
        // it promoted as an operand) is when its only use is a live-out copy.
        if (UI->getOpcode() != ISD::CopyToReg)
          return false;
      }
    }
    Promote = true;
    break;
  }
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    // fallthrough
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}

//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  std::string AsmStr = IA->getAsmString();

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = AsmPieces[0];
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.

    // FIXME: this should verify that we are targeting a 486 or better. If not,
    // we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm. For now, we don't support 486 or
    // lower, so don't worry about this.
    // bswap $0
    if (AsmPieces.size() == 2 &&
        (AsmPieces[0] == "bswap" ||
         AsmPieces[0] == "bswapq" ||
         AsmPieces[0] == "bswapl") &&
        (AsmPieces[1] == "$0" ||
         AsmPieces[1] == "${0:q}")) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
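      // (Illustrative source form this case accepts, with hypothetical
      // operands:  asm("bswap $0" : "=r"(x) : "0"(x))  -- rewritten below
      // into an @llvm.bswap call when the width is a multiple of 16 bits.)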
      const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
      if (!Ty || Ty->getBitWidth() % 16 != 0)
        return false;
      return IntrinsicLowering::LowerToByteSwap(CI);
    }
    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        AsmPieces.size() == 3 &&
        (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") &&
        AsmPieces[1] == "$$8," &&
        AsmPieces[2] == "${0:w}" &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
      AsmPieces.clear();
      const std::string &ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      std::sort(AsmPieces.begin(), AsmPieces.end());
      if (AsmPieces.size() == 4 &&
          AsmPieces[0] == "~{cc}" &&
          AsmPieces[1] == "~{dirflag}" &&
          AsmPieces[2] == "~{flags}" &&
          AsmPieces[3] == "~{fpsr}") {
        const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
        if (!Ty || Ty->getBitWidth() % 16 != 0)
          return false;
        return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
      SmallVector<StringRef, 4> Words;
      SplitString(AsmPieces[0], Words, " \t,");
      if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" &&
          Words[2] == "${0:w}") {
        Words.clear();
        SplitString(AsmPieces[1], Words, " \t,");
        if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" &&
            Words[2] == "$0") {
          Words.clear();
          SplitString(AsmPieces[2], Words, " \t,");
          if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" &&
              Words[2] == "${0:w}") {
            AsmPieces.clear();
            const std::string &ConstraintsStr = IA->getConstraintString();
            SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
            std::sort(AsmPieces.begin(), AsmPieces.end());
            if (AsmPieces.size() == 4 &&
                AsmPieces[0] == "~{cc}" &&
                AsmPieces[1] == "~{dirflag}" &&
                AsmPieces[2] == "~{flags}" &&
                AsmPieces[3] == "~{fpsr}") {
              const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
              if (!Ty || Ty->getBitWidth() % 16 != 0)
                return false;
              return IntrinsicLowering::LowerToByteSwap(CI);
            }
          }
        }
      }
    }

    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
        SmallVector<StringRef, 4> Words;
        SplitString(AsmPieces[0], Words, " \t");
        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
          Words.clear();
          SplitString(AsmPieces[1], Words, " \t");
          if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
            Words.clear();
            SplitString(AsmPieces[2], Words, " \t,");
            if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
                Words[2] == "%edx") {
              const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
              if (!Ty || Ty->getBitWidth() % 16 != 0)
                return false;
              return IntrinsicLowering::LowerToByteSwap(CI);
            }
          }
        }
      }
    }
    break;
  }
  return false;
}
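
// For reference, an illustrative inline-asm body the i64 case above accepts
// on 32-bit x86 (variable names hypothetical):
//   asm("bswap %eax\n\tbswap %edx\n\txchgl %eax, %edx" : "=A"(v) : "0"(v));
// Each recognized form collapses into an @llvm.bswap.* intrinsic, which the
// backend can then select or constant-fold like any other node.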

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R':
    case 'q':
    case 'Q':
    case 'f':
    case 't':
    case 'u':
    case 'y':
    case 'x':
    case 'Y':
      return C_RegisterClass;
    case 'a':
    case 'b':
    case 'c':
    case 'd':
    case 'S':
    case 'D':
    case 'A':
      return C_Register;
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'G':
    case 'C':
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
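/// (Illustrative example: an x86_mmx operand matched against the alternatives
/// "y,r" scores CW_SpecificReg for 'y' below when MMX is available, which is
/// expected to outweigh a plain general-register match.)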
TargetLowering::ConstraintWeight
  X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (CallOperandVal == NULL)
    return CW_Default;
  const Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
  case 'R':
  case 'q':
  case 'Q':
  case 'a':
  case 'b':
  case 'c':
  case 'd':
  case 'S':
  case 'D':
  case 'A':
    if (type->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f':
  case 't':
  case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget->hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'x':
  case 'Y':
    if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM())
      weight = CW_Register;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G':
  case 'C':
    if (isa<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}

/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasXMMInt())
      return "Y";
    if (Subtarget->hasXMM())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}
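// (Illustrative effect: an "X" constraint on a double operand becomes "Y"
// when SSE2 is available and "x" with only SSE1; everything else falls
// through to the target-independent handling, which uses 'f' for FP.)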

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     char Constraint,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result(0, 0);

  switch (Constraint) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
        break;
      }
      // FIXME gcc accepts some relocatable values here too, but only in
      // certain memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }

    // In any sort of PIC mode, addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup. These can't
    // be used as immediates.
    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
      return;

    // If we are in non-PIC codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = 0;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
                                                        getTargetMachine())))
      return;

    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

std::vector<unsigned> X86TargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                  EVT VT) const {
  if (Constraint.size() == 1) {
    // FIXME: not handling fp-stack yet!
    switch (Constraint[0]) {      // GCC X86 Constraint Letters
    default: break;               // Unknown constraint letter
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
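      // (Illustrative difference: in 32-bit mode 'q' falls through to Q_REGS,
      // so an i8 operand may only land in AL/DL/CL/BL; in 64-bit mode the low
      // byte of every GPR is addressable, hence the wider lists below.)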
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32)
          return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
                                       X86::ESI, X86::EDI, X86::R8D, X86::R9D,
                                       X86::R10D,X86::R11D,X86::R12D,
                                       X86::R13D,X86::R14D,X86::R15D,
                                       X86::EBP, X86::ESP, 0);
        else if (VT == MVT::i16)
          return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX,
                                       X86::SI, X86::DI, X86::R8W,X86::R9W,
                                       X86::R10W,X86::R11W,X86::R12W,
                                       X86::R13W,X86::R14W,X86::R15W,
                                       X86::BP, X86::SP, 0);
        else if (VT == MVT::i8)
          return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL,
                                       X86::SIL, X86::DIL, X86::R8B,X86::R9B,
                                       X86::R10B,X86::R11B,X86::R12B,
                                       X86::R13B,X86::R14B,X86::R15B,
                                       X86::BPL, X86::SPL, 0);

        else if (VT == MVT::i64)
          return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
                                       X86::RSI, X86::RDI, X86::R8, X86::R9,
                                       X86::R10, X86::R11, X86::R12,
                                       X86::R13, X86::R14, X86::R15,
                                       X86::RBP, X86::RSP, 0);

        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32)
        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
      else if (VT == MVT::i16)
        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
      else if (VT == MVT::i8)
        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
      else if (VT == MVT::i64)
        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
      break;
    }
  }

  return std::vector<unsigned>();
}

std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasXMMInt()) break;
      // FALL THROUGH.
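      // ('Y' additionally guarantees SSE2 before sharing the 'x' handling
      //  below; 'x' itself only requires SSE1, so e.g. asm with "x" on a
      //  v4f32 operand is accepted even on an SSE1-only target.)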
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasXMM()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::ST0+Constraint[4]-'0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
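  // (For example, "{ax}" with an i32 operand first resolves to (AX, GR16);
  // the remapping below rewrites it to (EAX, GR32) rather than letting the
  // value be split across {ax},{dx}.)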
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}
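// (Illustrative instance of the XMM fix-up above: "{xmm0}" with an f64
// operand can come back from the generic mapper as (XMM0, FR32), since the
// mapper picks the first class containing the register; the code retargets
// the class to FR64 so copies use the right register class.)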