X86ISelLowering.cpp revision 06cc324b9da1dc8fb7360a560343c28f5e7a940a
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "Utils/X86ShuffleDecode.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static SDValue Insert128BitVector(SDValue Result,
                                  SDValue Vec,
                                  SDValue Idx,
                                  SelectionDAG &DAG,
                                  DebugLoc dl);

static SDValue Extract128BitVector(SDValue Vec,
                                   SDValue Idx,
                                   SelectionDAG &DAG,
                                   DebugLoc dl);

static SDValue ConcatVectors(SDValue Lower, SDValue Upper, SelectionDAG &DAG);


/// Generate a DAG to grab 128 bits from a vector > 128 bits.  This
/// sets things up to match to an AVX VEXTRACTF128 instruction or a
/// simple subregister reference.  Idx is an index in the 128 bits we
/// want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec,
                                   SDValue Idx,
                                   SelectionDAG &DAG,
                                   DebugLoc dl) {
  EVT VT = Vec.getValueType();
  assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");

  EVT ElVT = VT.getVectorElementType();

  int Factor = VT.getSizeInBits() / 128;

  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(),
                                  ElVT,
                                  VT.getVectorNumElements() / Factor);

  // Extract from UNDEF is UNDEF.
  if (Vec.getOpcode() == ISD::UNDEF)
    return DAG.getNode(ISD::UNDEF, dl, ResultVT);

  if (isa<ConstantSDNode>(Idx)) {
    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

    // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
    // that we can match to VEXTRACTF128.
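    // For example, extracting i32 element 5 from a v8i32 gives
    // ElemsPerChunk = 128/32 = 4 and NormalizedIdxVal = ((5*32)/128)*4 = 4,
    // i.e. the extract starts at the first element of the upper chunk.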
    unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();

    // This is the index of the first element of the 128-bit chunk
    // we want.
    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
                                 * ElemsPerChunk);

    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);

    SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                                 VecIdx);

    return Result;
  }

  return SDValue();
}

/// Generate a DAG to put 128 bits into a vector > 128 bits.  This
/// sets things up to match to an AVX VINSERTF128 instruction or a
/// simple superregister reference.  Idx is an index in the 128 bits
/// we want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result,
                                  SDValue Vec,
                                  SDValue Idx,
                                  SelectionDAG &DAG,
                                  DebugLoc dl) {
  if (isa<ConstantSDNode>(Idx)) {
    EVT VT = Vec.getValueType();
    assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");

    EVT ElVT = VT.getVectorElementType();

    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

    EVT ResultVT = Result.getValueType();

    // Insert the relevant 128 bits.
    unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();

    // This is the index of the first element of the 128-bit chunk
    // we want.
    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
                                 * ElemsPerChunk);

    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);

    Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                         VecIdx);
    return Result;
  }

  return SDValue();
}

/// Given two vectors, concat them.
static SDValue ConcatVectors(SDValue Lower, SDValue Upper, SelectionDAG &DAG) {
  DebugLoc dl = Lower.getDebugLoc();

  assert(Lower.getValueType() == Upper.getValueType() && "Mismatched vectors!");

  EVT VT = EVT::getVectorVT(*DAG.getContext(),
                            Lower.getValueType().getVectorElementType(),
                            Lower.getValueType().getVectorNumElements() * 2);

  // TODO: Generalize to arbitrary vector length (this assumes 256-bit vectors).
  assert(VT.getSizeInBits() == 256 && "Unsupported vector concat!");

  // Insert the upper subvector.
  SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Upper,
                                   DAG.getConstant(
                                     // This is half the length of the result
                                     // vector.  Start inserting the upper 128
                                     // bits here.
                                     Lower.getValueType().getVectorNumElements(),
                                     MVT::i32),
                                   DAG, dl);

  // Insert the lower subvector.
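  // Element index 0 places Lower in the low 128 bits of the result, below
  // the upper half written just above.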
  Vec = Insert128BitVector(Vec, Lower, DAG.getConstant(0, MVT::i32), DAG, dl);
  return Vec;
}

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
  bool is64Bit = Subtarget->is64Bit();

  if (Subtarget->isTargetEnvMacho()) {
    if (is64Bit)
      return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  }

  if (Subtarget->isTargetELF()) {
    if (is64Bit)
      return new X8664_ELFTargetObjectFile(TM);
    return new X8632_ELFTargetObjectFile(TM);
  }
  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    return new TargetLoweringObjectFileCOFF();
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasXMMInt();
  X86ScalarSSEf32 = Subtarget->hasXMM();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.
  static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird: it always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler; for
  // 32-bit code, use the register-pressure-specific scheduling.
  if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    // Set up the Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2");
    setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C);
    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C);
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
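  // Marking these Expand means e.g. a truncating i64->i32 store is legalized
  // into an explicit TRUNCATE followed by an ordinary i32 store.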
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8,  Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8,  Expand);
  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1,  Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
  } else if (!UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
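  // For example, an f64 -> i16 fptoui can be done as an f64 -> i32 fptosi
  // followed by a truncate, since every u16 value fits in the non-negative
  // i32 range.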
  setOperationAction(ISD::FP_TO_UINT, MVT::i1,  Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8,  Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions.  This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions.  However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86.  Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (unsigned i = 0, e = 4; i != e; ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
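    // The glue value models the carry chain: on x86 the carry lives in
    // EFLAGS, so e.g. an i64 add on a 32-bit target becomes an ADD plus an
    // ADC linked through the glue.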
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }

  setOperationAction(ISD::BR_JT,     MVT::Other, Expand);
  setOperationAction(ISD::BRCOND,    MVT::Other, Custom);
  setOperationAction(ISD::BR_CC,     MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1,  Expand);
  setOperationAction(ISD::FP_ROUND_INREG,    MVT::f32, Expand);
  setOperationAction(ISD::FREM,              MVT::f32, Expand);
  setOperationAction(ISD::FREM,              MVT::f64, Expand);
  setOperationAction(ISD::FREM,              MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_,       MVT::i32, Custom);

  setOperationAction(ISD::CTTZ, MVT::i8,  Custom);
  setOperationAction(ISD::CTLZ, MVT::i8,  Custom);
  setOperationAction(ISD::CTTZ, MVT::i16, Custom);
  setOperationAction(ISD::CTLZ, MVT::i16, Custom);
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTLZ, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTTZ, MVT::i64, Custom);
    setOperationAction(ISD::CTLZ, MVT::i64, Custom);
  }

  if (Subtarget->hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8,  Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP,            MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT, MVT::i8,  Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC,  MVT::i8,  Custom);
  setOperationAction(ISD::SETCC,  MVT::i16, Custom);
  setOperationAction(ISD::SETCC,  MVT::i32, Custom);
  setOperationAction(ISD::SETCC,  MVT::f32, Custom);
  setOperationAction(ISD::SETCC,  MVT::f64, Custom);
  setOperationAction(ISD::SETCC,  MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC,  MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool,     MVT::i32, Custom);
  setOperationAction(ISD::JumpTable,        MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress,    MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol,   MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress,     MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool,   MVT::i64, Custom);
    setOperationAction(ISD::JumpTable,      MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress,  MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress,   MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86).
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasXMM())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  // We may not have a libcall for MEMBARRIER so we should lower this.
  setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
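  // For example, "fence; lock xadd; fence" can be emitted as just the locked
  // xadd: the LOCK prefix already provides the required ordering.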
  setShouldFoldAtomicFences(true);

  // Expand certain atomics.
  for (unsigned i = 0, e = 4; i != e; ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
  }

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i64, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND,   MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG,  MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG,  MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE,    MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing() || Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
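    // copysign(x, y) is pure bit manipulation: clear x's sign bit with an
    // ANDPD against ~(1 << 63), isolate y's sign bit with an ANDPD against
    // (1 << 63), and merge the two results with an ORPD.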
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
    }
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG,   MVT::f80, Expand);
  setOperationAction(ISD::FLOG2,  MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP,   MVT::f80, Expand);
  setOperationAction(ISD::FEXP2,  MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization).  Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL,   (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA,   (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL,   (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG,   (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP,   (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::TRUNCATE,    (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ANY_EXTEND,  (MVT::SimpleValueType)VT, Expand);
    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction((MVT::SimpleValueType)VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::EXTLOAD,  (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && Subtarget->hasMMX()) {
    addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  // MMX-sized vectors (other than x86mmx) are expected to be expanded
  // into smaller operations.
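  // Only the opaque x86mmx type maps onto the MMX registers, so e.g. a
  // v2i32 add is legalized via scalar or SSE operations rather than MMX
  // instructions.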
  setOperationAction(ISD::MULHS, MVT::v8i8,  Expand);
  setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
  setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
  setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
  setOperationAction(ISD::AND,   MVT::v8i8,  Expand);
  setOperationAction(ISD::AND,   MVT::v4i16, Expand);
  setOperationAction(ISD::AND,   MVT::v2i32, Expand);
  setOperationAction(ISD::AND,   MVT::v1i64, Expand);
  setOperationAction(ISD::OR,    MVT::v8i8,  Expand);
  setOperationAction(ISD::OR,    MVT::v4i16, Expand);
  setOperationAction(ISD::OR,    MVT::v2i32, Expand);
  setOperationAction(ISD::OR,    MVT::v1i64, Expand);
  setOperationAction(ISD::XOR,   MVT::v8i8,  Expand);
  setOperationAction(ISD::XOR,   MVT::v4i16, Expand);
  setOperationAction(ISD::XOR,   MVT::v2i32, Expand);
  setOperationAction(ISD::XOR,   MVT::v1i64, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8,  Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
  setOperationAction(ISD::SELECT, MVT::v8i8,  Expand);
  setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
  setOperationAction(ISD::BITCAST, MVT::v8i8,  Expand);
  setOperationAction(ISD::BITCAST, MVT::v4i16, Expand);
  setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
  setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);

  if (!UseSoftFloat && Subtarget->hasXMM()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC,             MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasXMMInt()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD,  MVT::v16i8, Legal);
    setOperationAction(ISD::ADD,  MVT::v8i16, Legal);
    setOperationAction(ISD::ADD,  MVT::v4i32, Legal);
    setOperationAction(ISD::ADD,  MVT::v2i64, Legal);
    setOperationAction(ISD::MUL,  MVT::v2i64, Custom);
    setOperationAction(ISD::SUB,  MVT::v16i8, Legal);
    setOperationAction(ISD::SUB,  MVT::v8i16, Legal);
    setOperationAction(ISD::SUB,  MVT::v4i32, Legal);
    setOperationAction(ISD::SUB,  MVT::v2i64, Legal);
    setOperationAction(ISD::MUL,  MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG,  MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR,  MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR,  MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
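    // Promotion rewrites, e.g., (and v4i32 a, b) as
    // (bitcast (and v2i64 (bitcast a), (bitcast b))), so every 128-bit
    // logical operation funnels through the v2i64 patterns.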
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND,    SVT, Promote);
      AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
      setOperationAction(ISD::OR,     SVT, Promote);
      AddPromotedToType (ISD::OR,     SVT, MVT::v2i64);
      setOperationAction(ISD::XOR,    SVT, Promote);
      AddPromotedToType (ISD::XOR,    SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD,   SVT, Promote);
      AddPromotedToType (ISD::LOAD,   SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD,   MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD,   MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
  }

  if (Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR,     MVT::f32, Legal);
    setOperationAction(ISD::FCEIL,      MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC,     MVT::f32, Legal);
    setOperationAction(ISD::FRINT,      MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR,     MVT::f64, Legal);
    setOperationAction(ISD::FCEIL,      MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC,     MVT::f64, Legal);
    setOperationAction(ISD::FRINT,      MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // Can turn SHL into an integer multiply.
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);
    setOperationAction(ISD::SRL, MVT::v4i32, Legal);

    // i8 and i16 vectors are custom because the source register and source
    // memory operand types are not the same width.  f32 vectors are
    // custom since the immediate controlling the insert encodes additional
    // information.
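    // For example, PINSRB's register form takes a 32-bit GPR source even
    // though it inserts an 8-bit element, while its memory form reads 8 bits.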
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42())
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);

  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);

    setOperationAction(ISD::FADD,  MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB,  MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL,  MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV,  MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG,  MVT::v8f32, Custom);

    setOperationAction(ISD::FADD,  MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB,  MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL,  MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV,  MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG,  MVT::v4f64, Custom);

    // Custom lower build_vector, vector_shuffle, scalar_to_vector,
    // insert_vector_elt, extract_subvector and extract_vector_elt for
    // 256-bit types.
    for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE;
         ++i) {
      MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-256-bit vectors.
      if (!isPowerOf2_32(MVT(VT).getVectorNumElements())
          || (MVT(VT).getSizeInBits() < 256))
        continue;
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
    }
    // Custom-lower insert_subvector and extract_subvector based on
    // the result type.
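    // A 128-bit result type means we are extracting half of a 256-bit
    // vector; a 256-bit result means we are inserting 128 bits into one
    // (see Extract128BitVector/Insert128BitVector above).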
    for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE;
         ++i) {
      MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-256-bit vectors.
      if (!isPowerOf2_32(MVT(VT).getVectorNumElements()))
        continue;

      if (MVT(VT).getSizeInBits() == 128) {
        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
      }
      else if (MVT(VT).getSizeInBits() == 256) {
        setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      }
    }

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    // Don't promote loads because we need them for VPERM vector index versions.

    for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE;
         VT++) {
      if (!isPowerOf2_32(MVT((MVT::SimpleValueType)VT).getVectorNumElements())
          || (MVT((MVT::SimpleValueType)VT).getSizeInBits() < 256))
        continue;
      setOperationAction(ISD::AND,    (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::AND,    (MVT::SimpleValueType)VT, MVT::v4i64);
      setOperationAction(ISD::OR,     (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::OR,     (MVT::SimpleValueType)VT, MVT::v4i64);
      setOperationAction(ISD::XOR,    (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::XOR,    (MVT::SimpleValueType)VT, MVT::v4i64);
      //setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Promote);
      //AddPromotedToType (ISD::LOAD, (MVT::SimpleValueType)VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::SELECT, (MVT::SimpleValueType)VT, MVT::v4i64);
    }
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);


  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
    // Add/Sub/Mul with overflow operations are custom lowered.
    MVT VT = IntVTs[i];
    setOperationAction(ISD::SADDO, VT, Custom);
    setOperationAction(ISD::UADDO, VT, Custom);
    setOperationAction(ISD::SSUBO, VT, Custom);
    setOperationAction(ISD::USUBO, VT, Custom);
    setOperationAction(ISD::SMULO, VT, Custom);
    setOperationAction(ISD::UMULO, VT, Custom);
  }

  // There are no 8-bit 3-address imul/mul instructions.
  setOperationAction(ISD::SMULO, MVT::i8, Expand);
  setOperationAction(ISD::UMULO, MVT::i8, Expand);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // On Darwin, -Os means optimize for size without hurting performance, so
  // do not reduce the limit.
  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.  For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasXMM())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering.  If DstAlign is zero, the destination alignment can satisfy any
/// constraint.
/// Similarly, if SrcAlign is zero, there is no need to check it against an
/// alignment requirement, probably because the source does not need to be
/// loaded.  If 'NonScalarIntSafe' is true, that means it's safe to return a
/// non-scalar-integer type, e.g. empty string source, constant, or loaded
/// from memory.  'MemcpyStrSrc' indicates whether the memcpy source is
/// constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux.  This is because the stack realignment code can't handle certain
  // cases like PR2962.  This should be removed when PR2962 is fixed.
  const Function *F = MF.getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasXMMInt()) {
      // Do not use f64 to lower memcpy if source is string constant.  It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function.  The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
}

// FIXME: Why is this routine here?  Move to RegInfo!
std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(EVT VT) const {
  const TargetRegisterClass *RRC = 0;
  uint8_t Cost = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = (Subtarget->is64Bit()
           ? X86::GR64RegisterClass : X86::GR32RegisterClass);
    break;
  case MVT::x86mmx:
    RRC = X86::VR64RegisterClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
  case MVT::v4f64:
    RRC = X86::VR128RegisterClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
                                               unsigned &Offset) const {
  if (!Subtarget->isTargetLinux())
    return false;

  if (Subtarget->is64Bit()) {
    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
    Offset = 0x28;
    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
      AddressSpace = 256;
    else
      AddressSpace = 257;
  } else {
    // %gs:0x14 on i386
    Offset = 0x14;
    AddressSpace = 256;
  }
  return true;
}


//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               DebugLoc dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv,
//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               DebugLoc dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // Add the regs to the liveout set for the function.
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  for (unsigned i = 0; i != RVLocs.size(); ++i)
    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
      MRI.addLiveOut(RVLocs[i].getLocReg());

  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
                                         MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = OutVals[i];
    EVT ValVT = ValToCopy.getValueType();

    // If this is x86-64, and we disabled SSE, we can't return FP values,
    // or SSE or MMX vectors.
    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
        (Subtarget->is64Bit() && !Subtarget->hasXMM())) {
      report_fatal_error("SSE register return with SSE disabled");
    }
    // Likewise we can't return F64 values with SSE1 only. gcc does so, but
    // llvm-gcc has never done it right and no one has noticed, so this
    // should be OK for now.
    if (ValVT == MVT::f64 &&
        (Subtarget->is64Bit() && !Subtarget->hasXMMInt()))
      report_fatal_error("SSE2 register return with SSE2 disabled");

    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
    // the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::ST0 ||
        VA.getLocReg() == X86::ST1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget->is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget->hasSSE2())
            ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, ValToCopy);
        }
      }
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
    Flag = Chain.getValue(1);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. We saved the argument into
  // a virtual register in the entry block, so now we copy the value out
  // and into %rax.
  if (Subtarget->is64Bit() &&
      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    assert(Reg &&
           "SRetReturnReg should have been set in LowerFormalArguments().");
    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());

    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX now acts like a return value.
    MRI.addLiveOut(X86::RAX);
  }

  RetOps[0] = Chain;  // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(X86ISD::RET_FLAG, dl,
                     MVT::Other, &RetOps[0], RetOps.size());
}
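// IR-level illustration of the sret rule above (assumed example, not from
// this file): for
//   define void @make(%struct.S* sret %out) { ... ret void }
// the x86-64 ABI requires %out to come back in %rax as well, which is why
// the hidden pointer is copied back out of its virtual register here.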
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const {
  if (N->getNumValues() != 1)
    return false;
  if (!N->hasNUsesOfValue(1, 0))
    return false;

  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() != ISD::CopyToReg &&
      Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != X86ISD::RET_FLAG)
      return false;
    HasRet = true;
  }

  return HasRet;
}

EVT
X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
                                            ISD::NodeType ExtendKind) const {
  MVT ReturnMVT;
  // TODO: Is this also valid on 32-bit?
  if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
    ReturnMVT = MVT::i8;
  else
    ReturnMVT = MVT::i32;

  EVT MinVT = getRegisterType(Context, ReturnMVT);
  return VT.bitsLT(MinVT) ? MinVT : VT;
}
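// Consequence of the extension rule above (assumed illustration): on x86-64
// a "zeroext i1" result is widened only to i8, so a caller can test it with
// a single byte compare of %al instead of a full 32-bit register, e.g.
//   define zeroext i1 @flag()   ; comes back as an i8 in %al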
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue
X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                   DebugLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals) const {

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget->is64Bit();
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    EVT CopyVT = VA.getValVT();

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) {
      report_fatal_error("SSE register return with SSE disabled");
    }

    SDValue Val;

    // If this is a call to a function that returns an fp value on the floating
    // point stack, we must guarantee that the value is popped from the stack,
    // so a CopyFromReg is not good enough - the copy instruction may be
    // eliminated if the return value is not used. We use the FpGET_ST0
    // instructions instead.
    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
      // If we prefer to use the value in xmm registers, copy it out as f80 and
      // use a truncate to move it from fp stack reg to xmm reg.
      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
      bool isST0 = VA.getLocReg() == X86::ST0;
      unsigned Opc = 0;
      if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32 : X86::FpGET_ST1_32;
      if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64 : X86::FpGET_ST1_64;
      if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80 : X86::FpGET_ST1_80;
      SDValue Ops[] = { Chain, InFlag };
      Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Glue,
                                         Ops, 2), 1);
      Val = Chain.getValue(0);

      // Round the f80 to the right size, which also moves it to the appropriate
      // xmm register.
      if (CopyVT != VA.getValVT())
        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                          // This truncation won't change the value.
                          DAG.getIntPtrConstant(1));
    } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::v2i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
                          Val, DAG.getConstant(0, MVT::i64));
      } else {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
      }
      Val = DAG.getNode(ISD::BITCAST, dl, CopyVT, Val);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                 CopyVT, InFlag).getValue(1);
      Val = Chain.getValue(0);
    }
    InFlag = Chain.getValue(2);
    InVals.push_back(Val);
  }

  return Chain;
}


//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  StdCall calling convention seems to be standard for many Windows' API
//  routines and around. It differs from the C calling convention just a
//  little: the callee should clean up the stack, not the caller. Symbols
//  should also be decorated in some fancy way :) It doesn't support any
//  vector arguments.
//  For info on fast calling convention see Fast Calling Convention (tail call)
//  implementation LowerX86_32FastCCCallTo.

/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
  if (Outs.empty())
    return false;

  return Outs[0].Flags.isSRet();
}

/// ArgsAreStructReturn - Determines whether a function uses struct
/// return semantics.
static bool
ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
  if (Ins.empty())
    return false;

  return Ins[0].Flags.isSRet();
}

/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
/// by "Src" to address "Dst" with size and alignment information specified by
/// the specific parameter attribute. The copy will be passed as a byval
/// function parameter.
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
                          DebugLoc dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);

  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*isVolatile*/false, /*AlwaysInline=*/true,
                       MachinePointerInfo(), MachinePointerInfo());
}
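// What the byval copy above amounts to, written as plain C++ (standalone
// sketch; the helper name is hypothetical and only the size/alignment
// semantics of ISD::ArgFlagsTy are assumed):
static inline void ExampleByValCopy(char *Dst, const char *Src,
                                    unsigned ByValSize) {
  // Non-volatile, always-inlined copy of exactly getByValSize() bytes; the
  // backend may still widen the copy using getByValAlign().
  for (unsigned i = 0; i != ByValSize; ++i)
    Dst[i] = Src[i];
}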
/// IsTailCallConvention - Return true if the calling convention is one that
/// supports tail call optimization.
static bool IsTailCallConvention(CallingConv::ID CC) {
  return (CC == CallingConv::Fast || CC == CallingConv::GHC);
}

/// FuncIsMadeTailCallSafe - Return true if the function is being made into
/// a tailcall target by changing its ABI.
static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
}

SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain,
                                    CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    DebugLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo *MFI,
                                    unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;

  // If value is passed by pointer we have address passed instead of the value
  // itself.
  if (VA.getLocInfo() == CCValAssign::Indirect)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This can be
  // changed with more analysis.
  // In case of tail call optimization, mark all arguments mutable, since they
  // could be overwritten by the lowering of the arguments of a tail call.
  if (Flags.isByVal()) {
    int FI = MFI->CreateFixedObject(Flags.getByValSize(),
                                    VA.getLocMemOffset(), isImmutable);
    return DAG.getFrameIndex(FI, getPointerTy());
  } else {
    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
                                    VA.getLocMemOffset(), isImmutable);
    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
    return DAG.getLoad(ValVT, dl, Chain, FIN,
                       MachinePointerInfo::getFixedStack(FI),
                       false, false, 0);
  }
}

SDValue
X86TargetLowering::LowerFormalArguments(SDValue Chain,
                                        CallingConv::ID CallConv,
                                        bool isVarArg,
                                        const SmallVectorImpl<ISD::InputArg> &Ins,
                                        DebugLoc dl,
                                        SelectionDAG &DAG,
                                        SmallVectorImpl<SDValue> &InVals)
                                          const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  const Function* Fn = MF.getFunction();
  if (Fn->hasExternalLinkage() &&
      Subtarget->isTargetCygMing() &&
      Fn->getName() == "main")
    FuncInfo->setForceFramePointer(true);

  MachineFrameInfo *MFI = MF.getFrameInfo();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsWin64 = Subtarget->isTargetWin64();

  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
         "Var args not supported with calling convention fastcc or ghc");

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64
  if (IsWin64) {
    CCInfo.AllocateStack(32, 8);
  }

  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);

  unsigned LastVal = ~0U;
  SDValue ArgValue;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
    // places.
    assert(VA.getValNo() != LastVal &&
           "Don't support value assigned to multiple locs yet");
    LastVal = VA.getValNo();

    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();
      TargetRegisterClass *RC = NULL;
      if (RegVT == MVT::i32)
        RC = X86::GR32RegisterClass;
      else if (Is64Bit && RegVT == MVT::i64)
        RC = X86::GR64RegisterClass;
      else if (RegVT == MVT::f32)
        RC = X86::FR32RegisterClass;
      else if (RegVT == MVT::f64)
        RC = X86::FR64RegisterClass;
      else if (RegVT.isVector() && RegVT.getSizeInBits() == 256)
        RC = X86::VR256RegisterClass;
      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
        RC = X86::VR128RegisterClass;
      else if (RegVT == MVT::x86mmx)
        RC = X86::VR64RegisterClass;
      else
        llvm_unreachable("Unknown argument type!");

      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);

      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits. Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::BCvt)
        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);

      if (VA.isExtInLoc()) {
        // Handle MMX values passed in XMM regs.
        if (RegVT.isVector()) {
          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(),
                                 ArgValue);
        } else
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
      }
    } else {
      assert(VA.isMemLoc());
      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
    }

    // If value is passed via pointer - do a load.
    if (VA.getLocInfo() == CCValAssign::Indirect)
      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
                             MachinePointerInfo(), false, false, 0);

    InVals.push_back(ArgValue);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. Save the argument into
  // a virtual register so that we can access it from the return points.
  if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
  }

  unsigned StackSize = CCInfo.getNextStackOffset();
  // Align stack specially for tail calls.
  if (FuncIsMadeTailCallSafe(CallConv))
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg) {
    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
                    CallConv != CallingConv::X86_ThisCall)) {
      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize, true));
    }
    if (Is64Bit) {
      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;

      // FIXME: We should really autogenerate these arrays
      static const unsigned GPR64ArgRegsWin64[] = {
        X86::RCX, X86::RDX, X86::R8,  X86::R9
      };
      static const unsigned GPR64ArgRegs64Bit[] = {
        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
      };
      static const unsigned XMMArgRegs64Bit[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
      };
      const unsigned *GPR64ArgRegs;
      unsigned NumXMMRegs = 0;

      if (IsWin64) {
        // The XMM registers which might contain var arg parameters are shadowed
        // in their paired GPR. So we only need to save the GPRs to their home
        // slots.
        TotalNumIntRegs = 4;
        GPR64ArgRegs = GPR64ArgRegsWin64;
      } else {
        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
        GPR64ArgRegs = GPR64ArgRegs64Bit;

        NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs);
      }
      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
                                                       TotalNumIntRegs);

      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
      assert(!(NumXMMRegs && !Subtarget->hasXMM()) &&
             "SSE register cannot be used when SSE is disabled!");
      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
             "SSE register cannot be used when SSE is disabled!");
      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM())
        // Kernel mode asks for SSE to be disabled, so don't push them
        // on the stack.
        TotalNumXMMRegs = 0;

      if (IsWin64) {
        const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
        // Get to the caller-allocated home save location. Add 8 to account
        // for the return address.
        int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
        FuncInfo->setRegSaveFrameIndex(
          MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
        // Fixup to set vararg frame on shadow area (4 x i64).
        if (NumIntRegs < 4)
          FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
      } else {
        // For X86-64, if there are vararg parameters that are passed via
        // registers, then we must store them to their spots on the stack so
        // they may be loaded by dereferencing the result of va_next.
        FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
        FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
        FuncInfo->setRegSaveFrameIndex(
          MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
                                 false));
      }

      // Store the integer parameter registers.
      SmallVector<SDValue, 8> MemOps;
      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                        getPointerTy());
      unsigned Offset = FuncInfo->getVarArgsGPOffset();
      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
                                  DAG.getIntPtrConstant(Offset));
        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
                                     X86::GR64RegisterClass);
        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
        SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       MachinePointerInfo::getFixedStack(
                         FuncInfo->getRegSaveFrameIndex(), Offset),
                       false, false, 0);
        MemOps.push_back(Store);
        Offset += 8;
      }

      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
        // Now store the XMM (fp + vector) parameter registers.
        SmallVector<SDValue, 11> SaveXMMOps;
        SaveXMMOps.push_back(Chain);

        unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
        SaveXMMOps.push_back(ALVal);

        SaveXMMOps.push_back(DAG.getIntPtrConstant(
                               FuncInfo->getRegSaveFrameIndex()));
        SaveXMMOps.push_back(DAG.getIntPtrConstant(
                               FuncInfo->getVarArgsFPOffset()));

        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
          unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
                                       X86::VR128RegisterClass);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
          SaveXMMOps.push_back(Val);
        }
        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                     MVT::Other,
                                     &SaveXMMOps[0], SaveXMMOps.size()));
      }

      if (!MemOps.empty())
        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                            &MemOps[0], MemOps.size());
    }
  }

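  // Worked numbers for the SysV x86-64 register save area built above
  // (illustration only, derived from the stores above): the area is
  // 6*8 + 8*16 = 176 bytes. After a function has consumed 2 GPRs and 1 XMM
  // register for named arguments, gp_offset = 2*8 = 16 and
  // fp_offset = 48 + 1*16 = 64, which is exactly where the two loops above
  // resume storing the remaining vararg registers.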
  // Some CCs need callee pop.
  if (Subtarget->IsCalleePop(isVarArg, CallConv)) {
    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
  } else {
    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins))
      FuncInfo->setBytesToPopOnReturn(4);
  }

  if (!Is64Bit) {
    // RegSaveFrameIndex is X86-64 only.
    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
    if (CallConv == CallingConv::X86_FastCall ||
        CallConv == CallingConv::X86_ThisCall)
      // fastcc functions can't have varargs.
      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
  }

  return Chain;
}

SDValue
X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
                                    SDValue StackPtr, SDValue Arg,
                                    DebugLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    ISD::ArgFlagsTy Flags) const {
  unsigned LocMemOffset = VA.getLocMemOffset();
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
  if (Flags.isByVal())
    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);

  return DAG.getStore(Chain, dl, Arg, PtrOff,
                      MachinePointerInfo::getStack(LocMemOffset),
                      false, false, 0);
}

/// EmitTailCallLoadRetAddr - Emit a load of the return address if tail call
/// optimization is performed and it is required.
SDValue
X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
                                           SDValue &OutRetAddr, SDValue Chain,
                                           bool IsTailCall, bool Is64Bit,
                                           int FPDiff, DebugLoc dl) const {
  // Adjust the Return address stack slot.
  EVT VT = getPointerTy();
  OutRetAddr = getReturnAddressFrameIndex(DAG);

  // Load the "old" Return address.
  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
                           false, false, 0);
  return SDValue(OutRetAddr.getNode(), 1);
}

/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff != 0).
static SDValue
EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
                         SDValue Chain, SDValue RetAddrFrIdx,
                         bool Is64Bit, int FPDiff, DebugLoc dl) {
  // Store the return address to the appropriate stack slot.
  if (!FPDiff) return Chain;
  // Calculate the new stack slot for the return address.
  int SlotSize = Is64Bit ? 8 : 4;
  int NewReturnAddrFI =
    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
                       MachinePointerInfo::getFixedStack(NewReturnAddrFI),
                       false, false, 0);
  return Chain;
}
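// Worked instance of the return-address relocation above (standalone
// illustration with assumed numbers): if a 32-bit caller's incoming
// arguments occupy 16 bytes and the tail-called callee needs 32 bytes,
// FPDiff = 16 - 32 = -16, so the return address is re-stored at
// FPDiff - SlotSize = -20 relative to the incoming argument base, leaving
// room for the larger outgoing argument area. Hypothetical helper:
static inline int ExampleNewRetAddrSlotOffset(int BytesCallerPushed,
                                              int BytesCalleeNeeds,
                                              int SlotSize) {
  int FPDiff = BytesCallerPushed - BytesCalleeNeeds;
  return FPDiff - SlotSize; // offset used for the new fixed object above
}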
SDValue
X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                             CallingConv::ID CallConv, bool isVarArg,
                             bool &isTailCall,
                             const SmallVectorImpl<ISD::OutputArg> &Outs,
                             const SmallVectorImpl<SDValue> &OutVals,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
                             DebugLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsWin64 = Subtarget->isTargetWin64();
  bool IsStructRet = CallIsStructReturn(Outs);
  bool IsSibcall = false;

  if (isTailCall) {
    // Check if it's really possible to do a tail call.
    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                    isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
                    Outs, OutVals, Ins, DAG);

    // Sibcalls are automatically detected tailcalls which do not require
    // ABI changes.
    if (!GuaranteedTailCallOpt && isTailCall)
      IsSibcall = true;

    if (isTailCall)
      ++NumTailCalls;
  }

  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
         "Var args not supported with calling convention fastcc or ghc");

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64
  if (IsWin64) {
    CCInfo.AllocateStack(32, 8);
  }

  CCInfo.AnalyzeCallOperands(Outs, CC_X86);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();
  if (IsSibcall)
    // This is a sibcall. The memory operands are already available in the
    // caller's own incoming argument area.
    NumBytes = 0;
  else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv))
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

  int FPDiff = 0;
  if (isTailCall && !IsSibcall) {
    // Lower arguments at fp - stackoffset + fpdiff.
    unsigned NumBytesCallerPushed =
      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
    FPDiff = NumBytesCallerPushed - NumBytes;

    // Set the delta of movement of the returnaddr stackslot, but only if the
    // delta is greater than the previous delta.
    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
  }

  if (!IsSibcall)
    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));

  SDValue RetAddrFrIdx;
  // Load the return address for tail calls.
  if (isTailCall && FPDiff)
    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
                                    Is64Bit, FPDiff, dl);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;

  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization, arguments are handled later.
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    EVT RegVT = VA.getLocVT();
    SDValue Arg = OutVals[i];
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::AExt:
      if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
        // Special case: passing MMX values in XMM registers.
        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
      } else
        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
      break;
    case CCValAssign::Indirect: {
      // Store the argument.
      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
                           MachinePointerInfo::getFixedStack(FI),
                           false, false, 0);
      Arg = SpillSlot;
      break;
    }
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      if (isVarArg && IsWin64) {
        // The Win64 ABI requires argument XMM regs to be copied to the
        // corresponding shadow reg if the callee is a varargs function.
        unsigned ShadowReg = 0;
        switch (VA.getLocReg()) {
        case X86::XMM0: ShadowReg = X86::RCX; break;
        case X86::XMM1: ShadowReg = X86::RDX; break;
        case X86::XMM2: ShadowReg = X86::R8; break;
        case X86::XMM3: ShadowReg = X86::R9; break;
        }
        if (ShadowReg)
          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
      }
    } else if (!IsSibcall && (!isTailCall || isByVal)) {
      assert(VA.isMemLoc());
      if (StackPtr.getNode() == 0)
        StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  // Tail call byval lowering might overwrite argument registers so in case of
  // tail call optimization the copies to registers are lowered later.
  if (!isTailCall)
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }

  if (Subtarget->isPICStyleGOT()) {
    // ELF / PIC requires the GOT pointer to be in the EBX register before
    // making function calls via the PLT.
    if (!isTailCall) {
      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
                               DAG.getNode(X86ISD::GlobalBaseReg,
                                           DebugLoc(), getPointerTy()),
                               InFlag);
      InFlag = Chain.getValue(1);
    } else {
      // If we are tail calling and generating PIC/GOT style code load the
      // address of the callee into ECX. The value in ecx is used as target of
      // the tail jump. This is done to circumvent the ebx/callee-saved problem
      // for tail calls on PIC/GOT architectures. Normally we would just put the
      // address of GOT into ebx and then call target@PLT. But for tail calls
      // ebx would be restored (since ebx is callee saved) before jumping to the
      // target@PLT.

      // Note: The actual moving to ECX is done further down.
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
      if (G && !G->getGlobal()->hasHiddenVisibility() &&
          !G->getGlobal()->hasProtectedVisibility())
        Callee = LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
        Callee = LowerExternalSymbol(Callee, DAG);
    }
  }

  if (Is64Bit && isVarArg && !IsWin64) {
    // From the AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as hidden argument to specify the number
    // of SSE registers used.
    // The contents of %al do not need to match exactly the number of
    // registers, but must be an upper bound on the number of SSE registers
    // used, and is in the range 0 - 8 inclusive.

    // Count the number of XMM registers allocated.
    static const unsigned XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
    assert((Subtarget->hasXMM() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");

    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
    InFlag = Chain.getValue(1);
  }

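  // Illustrative note on the upper-bound property above (not a requirement
  // imposed by this code): a conservative caller could always move 8 into
  // %al before such a call; passing the exact count, as done here, lets a
  // callee that tests %al skip dumping XMM registers it was never passed.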
  // For tail calls lower the arguments to the 'real' stack slot.
  if (isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.
    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI = 0;
    // Do not flag preceding copytoreg stuff together with the following stuff.
    InFlag = SDValue();
    if (GuaranteedTailCallOpt) {
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (VA.isRegLoc())
          continue;
        assert(VA.isMemLoc());
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        // Create frame index.
        int32_t Offset = VA.getLocMemOffset()+FPDiff;
        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
        FIN = DAG.getFrameIndex(FI, getPointerTy());

        if (Flags.isByVal()) {
          // Copy relative to framepointer.
          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
          if (StackPtr.getNode() == 0)
            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
                                          getPointerTy());
          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);

          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                           ArgChain,
                                                           Flags, DAG, dl));
        } else {
          // Store relative to framepointer.
          MemOpChains2.push_back(
            DAG.getStore(ArgChain, dl, Arg, FIN,
                         MachinePointerInfo::getFixedStack(FI),
                         false, false, 0));
        }
      }
    }

    if (!MemOpChains2.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                          &MemOpChains2[0], MemOpChains2.size());

    // Copy arguments to their registers.
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }
    InFlag = SDValue();

    // Store the return address to the appropriate stack slot.
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
                                     FPDiff, dl);
  }

  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
    // In the 64-bit large code model, we have to make all calls
    // through a register, since the call instruction's 32-bit
    // pc-relative offset may not be large enough to hold the whole
    // address.
  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    // If the callee is a GlobalAddress node (quite common, every direct call
    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
    // it.

    // We should use extra load for direct calls to dllimported functions in
    // non-JIT mode.
    const GlobalValue *GV = G->getGlobal();
    if (!GV->hasDLLImportLinkage()) {
      unsigned char OpFlags = 0;

      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
      // external symbols must go through the PLT in PIC mode. If the symbol
      // has hidden or protected visibility, or if it is static or local, then
      // we don't need to use the PLT - we can directly call it.
      if (Subtarget->isTargetELF() &&
          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
        OpFlags = X86II::MO_PLT;
      } else if (Subtarget->isPICStyleStubAny() &&
                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
                 Subtarget->getDarwinVers() < 9) {
        // PC-relative references to external symbols should go through $stub,
        // unless we're building with the leopard linker or later, which
        // automatically synthesizes these stubs.
        OpFlags = X86II::MO_DARWIN_STUB;
      }

      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
                                          G->getOffset(), OpFlags);
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    unsigned char OpFlags = 0;

    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
    // external symbols should go through the PLT.
    if (Subtarget->isTargetELF() &&
        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
      OpFlags = X86II::MO_PLT;
    } else if (Subtarget->isPICStyleStubAny() &&
               Subtarget->getDarwinVers() < 9) {
      // PC-relative references to external symbols should go through $stub,
      // unless we're building with the leopard linker or later, which
      // automatically synthesizes these stubs.
      OpFlags = X86II::MO_DARWIN_STUB;
    }

    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
                                         OpFlags);
  }
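  // Summary of the classification above (illustrative recap): ELF+PIC direct
  // calls to preemptible symbols get MO_PLT; pre-Leopard Darwin stub-style
  // PIC gets MO_DARWIN_STUB; everything else stays a plain pc-relative
  // direct call, and the 64-bit large code model instead forces the whole
  // address into a register.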
  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SmallVector<SDValue, 8> Ops;

  if (!IsSibcall && isTailCall) {
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(0, true), InFlag);
    InFlag = Chain.getValue(1);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (isTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add an implicit use of the GOT pointer in EBX.
  if (!isTailCall && Subtarget->isPICStyleGOT())
    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));

  // Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
  if (Is64Bit && isVarArg && !IsWin64)
    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (isTailCall) {
    // We used to do:
    //// If this is the first return lowered for this function, add the regs
    //// to the liveout set for the function.
    // This isn't right, although it's probably harmless on x86; liveouts
    // should be computed from returns not tail calls. Consider a void
    // function making a tail call to a function returning int.
    return DAG.getNode(X86ISD::TC_RETURN, dl,
                       NodeTys, &Ops[0], Ops.size());
  }

  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPush;
  if (Subtarget->IsCalleePop(isVarArg, CallConv))
    NumBytesForCalleeToPush = NumBytes;  // Callee pops everything
  else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
    // If this is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    NumBytesForCalleeToPush = 4;
  else
    NumBytesForCalleeToPush = 0;  // Callee pops nothing.

  // Returns a flag for retval copy to use.
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
                                                     true),
                               InFlag);
    InFlag = Chain.getValue(1);
  }

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
                         Ins, dl, DAG, InVals);
}


//===----------------------------------------------------------------------===//
//                Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//

//  Like stdcall, the callee cleans up the arguments; unlike stdcall, ECX is
//  reserved for storing the tail-called function's address. Only 2 registers
//  are free for argument passing (inreg). Tail call optimization is performed
//  provided:
//                * tailcallopt is enabled
//                * caller/callee are fastcc
//  On X86_64 architecture with GOT-style position independent code only local
//  (within module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this - darwin's
//  dyld for example)
//  If a tail called callee has more arguments than the caller, the caller
//  needs to make sure that there is room to move the RETADDR to. This is
//  achieved by reserving an area the size of the argument delta right after
//  the original RETADDR, but before the saved framepointer or the spilled
//  registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
//  stack layout:
//    arg1
//    arg2
//    RETADDR
//    [ new RETADDR
//      move area ]
//    (possible EBP)
//    ESI
//    EDI
//    local1 ..

/// GetAlignedArgumentStackSize - Round the stack size up so that, together
/// with the return-address slot, it meets the platform stack alignment,
/// e.g. to 16n + 12 for a 16-byte alignment requirement with 4-byte slots.
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                               SelectionDAG& DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameLowering &TFI = *TM.getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  uint64_t AlignMask = StackAlignment - 1;
  int64_t Offset = StackSize;
  uint64_t SlotSize = TD->getPointerSize();
  if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
    // Number smaller than 12 so just add the difference.
    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
  } else {
    // Mask out lower bits, add stackalignment once plus the 12 bytes.
    Offset = ((~AlignMask) & Offset) + StackAlignment +
             (StackAlignment - SlotSize);
  }
  return Offset;
}
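// A standalone instance of the arithmetic above with the common 32-bit
// numbers plugged in (StackAlignment = 16, SlotSize = 4); hypothetical
// helper, for illustration only:
static inline unsigned ExampleAlignedArgStackSize(unsigned StackSize) {
  const unsigned StackAlignment = 16, SlotSize = 4;
  const unsigned AlignMask = StackAlignment - 1;
  unsigned Offset = StackSize;
  if ((Offset & AlignMask) <= (StackAlignment - SlotSize))
    Offset += (StackAlignment - SlotSize) - (Offset & AlignMask);
  else
    Offset = (~AlignMask & Offset) + StackAlignment +
             (StackAlignment - SlotSize);
  return Offset; // e.g. 0 -> 12, 13 -> 28, 16 -> 28: always 16n + 12
}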

/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same (relative) position of the caller's
/// incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
                         const X86InstrInfo *TII) {
  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
  int FI = INT_MAX;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!TargetRegisterInfo::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(Def, FI))
        return false;
    } else {
      unsigned Opcode = Def->getOpcode();
      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
          Def->getOperand(1).isFI()) {
        FI = Def->getOperand(1).getIndex();
        Bytes = Flags.getByValSize();
      } else
        return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else
    return false;

  assert(FI != INT_MAX);
  if (!MFI->isFixedObjectIndex(FI))
    return false;
  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
}
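// IR-level illustration of the case MatchingStackOffset accepts (assumed
// example, not from this file): an incoming stack argument forwarded
// unchanged occupies the same fixed offset, so the sibcall can reuse the
// caller's own slot without any stores:
//   define i32 @f(i32 %a, i32 %b) {
//     %r = tail call i32 @g(i32 %a, i32 %b)
//     ret i32 %r
//   }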

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool
X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                                     bool isCalleeStructRet,
                                                     bool isCallerStructRet,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG& DAG) const {
  if (!IsTailCallConvention(CalleeCC) &&
      CalleeCC != CallingConv::C)
    return false;

  // If -tailcallopt is specified, make fastcc functions tail-callable.
  const MachineFunction &MF = DAG.getMachineFunction();
  const Function *CallerF = DAG.getMachineFunction().getFunction();
  CallingConv::ID CallerCC = CallerF->getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;

  if (GuaranteedTailCallOpt) {
    if (IsTailCallConvention(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
  // emit a special epilogue.
  if (RegInfo->needsStackRealignment(MF))
    return false;

  // Do not sibcall optimize vararg calls unless the call site is not passing
  // any arguments.
  if (isVarArg && !Outs.empty())
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // If the call result is in ST0 / ST1, it needs to be popped off the x87
  // stack. Therefore, if the result is unused, it is not safe to optimize
  // this into a sibcall.
  bool Unused = false;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    if (!Ins[i].Used) {
      Unused = true;
      break;
    }
  }
  if (Unused) {
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CalleeCC, false, getTargetMachine(),
                   RVLocs, *DAG.getContext());
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
      CCValAssign &VA = RVLocs[i];
      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
        return false;
    }
  }

  // If the calling conventions do not match, then we'd better make sure the
  // results are returned in the same way as what the caller expects.
  if (!CCMatch) {
    SmallVector<CCValAssign, 16> RVLocs1;
    CCState CCInfo1(CalleeCC, false, getTargetMachine(),
                    RVLocs1, *DAG.getContext());
    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);

    SmallVector<CCValAssign, 16> RVLocs2;
    CCState CCInfo2(CallerCC, false, getTargetMachine(),
                    RVLocs2, *DAG.getContext());
    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);

    if (RVLocs1.size() != RVLocs2.size())
      return false;
    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
        return false;
      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
        return false;
      if (RVLocs1[i].isRegLoc()) {
        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
          return false;
      } else {
        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
          return false;
      }
    }
  }

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
                   ArgLocs, *DAG.getContext());

    // Allocate shadow area for Win64
    if (Subtarget->isTargetWin64()) {
      CCInfo.AllocateStack(32, 8);
    }

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    if (CCInfo.getNextStackOffset()) {
      MachineFunction &MF = DAG.getMachineFunction();
      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
        return false;

      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo *MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const X86InstrInfo *TII =
        ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }

    // If the tailcall address may be in a register, then make sure it's
    // possible to register allocate for it. In 32-bit, the call address can
    // only target EAX, EDX, or ECX since the tail call must be scheduled after
    // callee-saved registers are restored. These happen to be the same
    // registers used to pass 'inreg' arguments so watch out for those.
    if (!Subtarget->is64Bit() &&
        !isa<GlobalAddressSDNode>(Callee) &&
        !isa<ExternalSymbolSDNode>(Callee)) {
      unsigned NumInRegs = 0;
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (!VA.isRegLoc())
          continue;
        unsigned Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == 3)
            return false;
          break;
        }
      }
    }
  }

  // A stdcall caller is expected to clean up its arguments; the callee
  // isn't going to do that.
  if (!CCMatch && CallerCC == CallingConv::X86_StdCall)
    return false;

  return true;
}

FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
  return X86::createFastISel(funcInfo);
}


//===----------------------------------------------------------------------===//
//                           Other Lowering Hooks
//===----------------------------------------------------------------------===//

static bool MayFoldLoad(SDValue Op) {
  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
}

static bool MayFoldIntoStore(SDValue Op) {
  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
}

static bool isTargetShuffle(unsigned Opcode) {
  switch(Opcode) {
  default: return false;
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::SHUFPD:
  case X86ISD::PALIGN:
  case X86ISD::SHUFPS:
  case X86ISD::MOVLHPS:
  case X86ISD::MOVLHPD:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLPS:
  case X86ISD::MOVLPD:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::UNPCKLPS:
  case X86ISD::UNPCKLPD:
  case X86ISD::VUNPCKLPS:
  case X86ISD::VUNPCKLPD:
  case X86ISD::VUNPCKLPSY:
  case X86ISD::VUNPCKLPDY:
  case X86ISD::PUNPCKLWD:
  case X86ISD::PUNPCKLBW:
  case X86ISD::PUNPCKLDQ:
  case X86ISD::PUNPCKLQDQ:
  case X86ISD::UNPCKHPS:
  case X86ISD::UNPCKHPD:
  case X86ISD::PUNPCKHWD:
  case X86ISD::PUNPCKHBW:
  case X86ISD::PUNPCKHDQ:
  case X86ISD::PUNPCKHQDQ:
    return true;
  }
  return false;
}

static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
                                    SDValue V1, SelectionDAG &DAG) {
  switch(Opc) {
  default: llvm_unreachable("Unknown x86 shuffle node");
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
    return DAG.getNode(Opc, dl, VT, V1);
  }

  return SDValue();
}

static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
                          SDValue V1, unsigned TargetMask, SelectionDAG &DAG) {
  switch(Opc) {
  default: llvm_unreachable("Unknown x86 shuffle node");
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
    return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
  }

  return SDValue();
}

static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
               SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) {
  switch(Opc) {
  default: llvm_unreachable("Unknown x86 shuffle node");
  case X86ISD::PALIGN:
  case X86ISD::SHUFPD:
  case X86ISD::SHUFPS:
    return DAG.getNode(Opc, dl, VT, V1, V2,
                       DAG.getConstant(TargetMask, MVT::i8));
  }
  return SDValue();
}

static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
                                    SDValue V1, SDValue V2, SelectionDAG &DAG) {
  switch(Opc) {
  default: llvm_unreachable("Unknown x86 shuffle node");
  case X86ISD::MOVLHPS:
  case X86ISD::MOVLHPD:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLPS:
  case X86ISD::MOVLPD:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::UNPCKLPS:
  case X86ISD::UNPCKLPD:
  case X86ISD::VUNPCKLPS:
  case X86ISD::VUNPCKLPD:
  case X86ISD::VUNPCKLPSY:
  case X86ISD::VUNPCKLPDY:
  case X86ISD::PUNPCKLWD:
  case X86ISD::PUNPCKLBW:
  case X86ISD::PUNPCKLDQ:
  case X86ISD::PUNPCKLQDQ:
  case X86ISD::UNPCKHPS:
  case X86ISD::UNPCKHPD:
  case X86ISD::PUNPCKHWD:
  case X86ISD::PUNPCKHBW:
  case X86ISD::PUNPCKHDQ:
  case X86ISD::PUNPCKHQDQ:
    return DAG.getNode(Opc, dl, VT, V1, V2);
  }
  return SDValue();
}

SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  int ReturnAddrIndex = FuncInfo->getRAIndex();

  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    uint64_t SlotSize = TD->getPointerSize();
    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
                                                           false);
    FuncInfo->setRAIndex(ReturnAddrIndex);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
}


bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                       bool hasSymbolicDisplacement) {
  // Offset should fit into 32 bit immediate field.
  if (!isInt<32>(Offset))
    return false;

  // If we don't have a symbolic displacement - we don't have any extra
  // restrictions.
  if (!hasSymbolicDisplacement)
    return true;

  // FIXME: Some tweaks might be needed for medium code model.
  if (M != CodeModel::Small && M != CodeModel::Kernel)
    return false;

  // For the small code model, we assume that the latest object is 16MB below
  // the end of the 31-bit address boundary. We may also accept pretty large
  // negative constants, knowing that all objects are in the positive half of
  // the address space.
  if (M == CodeModel::Small && Offset < 16*1024*1024)
    return true;

  // For the kernel code model, we know that all objects reside in the
  // negative half of the 32-bit address space. We must not accept negative
  // offsets, since they may be just off, but we may accept pretty large
  // positive ones.
  if (M == CodeModel::Kernel && Offset > 0)
    return true;

  return false;
}
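// Worked numbers for the checks above (illustration only): with the small
// code model, GV+0x00F00000 is accepted (well under the 16MB headroom below
// 2^31), while with the kernel code model GV+0x1000 is accepted but GV-8 is
// rejected, since kernel objects live in the upper (negative) 2GB and a
// negative offset could fall outside the object's half of the address space.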
2842 return X86::COND_S; 2843 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2844 // X < 1 -> X <= 0 2845 RHS = DAG.getConstant(0, RHS.getValueType()); 2846 return X86::COND_LE; 2847 } 2848 } 2849 2850 switch (SetCCOpcode) { 2851 default: llvm_unreachable("Invalid integer condition!"); 2852 case ISD::SETEQ: return X86::COND_E; 2853 case ISD::SETGT: return X86::COND_G; 2854 case ISD::SETGE: return X86::COND_GE; 2855 case ISD::SETLT: return X86::COND_L; 2856 case ISD::SETLE: return X86::COND_LE; 2857 case ISD::SETNE: return X86::COND_NE; 2858 case ISD::SETULT: return X86::COND_B; 2859 case ISD::SETUGT: return X86::COND_A; 2860 case ISD::SETULE: return X86::COND_BE; 2861 case ISD::SETUGE: return X86::COND_AE; 2862 } 2863 } 2864 2865 // First determine if it is required or is profitable to flip the operands. 2866 2867 // If LHS is a foldable load, but RHS is not, flip the condition. 2868 if (ISD::isNON_EXTLoad(LHS.getNode()) && 2869 !ISD::isNON_EXTLoad(RHS.getNode())) { 2870 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2871 std::swap(LHS, RHS); 2872 } 2873 2874 switch (SetCCOpcode) { 2875 default: break; 2876 case ISD::SETOLT: 2877 case ISD::SETOLE: 2878 case ISD::SETUGT: 2879 case ISD::SETUGE: 2880 std::swap(LHS, RHS); 2881 break; 2882 } 2883 2884 // On a floating point condition, the flags are set as follows: 2885 // ZF PF CF op 2886 // 0 | 0 | 0 | X > Y 2887 // 0 | 0 | 1 | X < Y 2888 // 1 | 0 | 0 | X == Y 2889 // 1 | 1 | 1 | unordered 2890 switch (SetCCOpcode) { 2891 default: llvm_unreachable("Condcode should be pre-legalized away"); 2892 case ISD::SETUEQ: 2893 case ISD::SETEQ: return X86::COND_E; 2894 case ISD::SETOLT: // flipped 2895 case ISD::SETOGT: 2896 case ISD::SETGT: return X86::COND_A; 2897 case ISD::SETOLE: // flipped 2898 case ISD::SETOGE: 2899 case ISD::SETGE: return X86::COND_AE; 2900 case ISD::SETUGT: // flipped 2901 case ISD::SETULT: 2902 case ISD::SETLT: return X86::COND_B; 2903 case ISD::SETUGE: // flipped 2904 case ISD::SETULE: 2905 case ISD::SETLE: return X86::COND_BE; 2906 case ISD::SETONE: 2907 case ISD::SETNE: return X86::COND_NE; 2908 case ISD::SETUO: return X86::COND_P; 2909 case ISD::SETO: return X86::COND_NP; 2910 case ISD::SETOEQ: 2911 case ISD::SETUNE: return X86::COND_INVALID; 2912 } 2913} 2914 2915/// hasFPCMov - Is there a floating point cmov for the specific X86 condition 2916/// code? The current x86 ISA includes the following FP cmov instructions: 2917/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 2918static bool hasFPCMov(unsigned X86CC) { 2919 switch (X86CC) { 2920 default: 2921 return false; 2922 case X86::COND_B: 2923 case X86::COND_BE: 2924 case X86::COND_E: 2925 case X86::COND_P: 2926 case X86::COND_A: 2927 case X86::COND_AE: 2928 case X86::COND_NE: 2929 case X86::COND_NP: 2930 return true; 2931 } 2932} 2933 2934/// isFPImmLegal - Returns true if the target can instruction select the 2935/// specified FP immediate natively. If false, the legalizer will 2936/// materialize the FP immediate as a load from a constant pool. 2937bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 2938 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 2939 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 2940 return true; 2941 } 2942 return false; 2943} 2944 2945/// isUndefOrInRange - Return true if Val is undef or if its value falls within 2946/// the specified range [Low, Hi).
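/// For example, isUndefOrInRange(-1, 0, 4) and isUndefOrInRange(3, 0, 4) return
/// true, while isUndefOrInRange(4, 0, 4) returns false.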
2947static bool isUndefOrInRange(int Val, int Low, int Hi) { 2948 return (Val < 0) || (Val >= Low && Val < Hi); 2949} 2950 2951/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2952/// specified value. 2953static bool isUndefOrEqual(int Val, int CmpVal) { 2954 if (Val < 0 || Val == CmpVal) 2955 return true; 2956 return false; 2957} 2958 2959/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2960/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2961/// the second operand. 2962static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2963 if (VT == MVT::v4f32 || VT == MVT::v4i32 ) 2964 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2965 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2966 return (Mask[0] < 2 && Mask[1] < 2); 2967 return false; 2968} 2969 2970bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2971 SmallVector<int, 8> M; 2972 N->getMask(M); 2973 return ::isPSHUFDMask(M, N->getValueType(0)); 2974} 2975 2976/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2977/// is suitable for input to PSHUFHW. 2978static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2979 if (VT != MVT::v8i16) 2980 return false; 2981 2982 // Lower quadword copied in order or undef. 2983 for (int i = 0; i != 4; ++i) 2984 if (Mask[i] >= 0 && Mask[i] != i) 2985 return false; 2986 2987 // Upper quadword shuffled. 2988 for (int i = 4; i != 8; ++i) 2989 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2990 return false; 2991 2992 return true; 2993} 2994 2995bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2996 SmallVector<int, 8> M; 2997 N->getMask(M); 2998 return ::isPSHUFHWMask(M, N->getValueType(0)); 2999} 3000 3001/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 3002/// is suitable for input to PSHUFLW. 3003static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3004 if (VT != MVT::v8i16) 3005 return false; 3006 3007 // Upper quadword copied in order. 3008 for (int i = 4; i != 8; ++i) 3009 if (Mask[i] >= 0 && Mask[i] != i) 3010 return false; 3011 3012 // Lower quadword shuffled. 3013 for (int i = 0; i != 4; ++i) 3014 if (Mask[i] >= 4) 3015 return false; 3016 3017 return true; 3018} 3019 3020bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 3021 SmallVector<int, 8> M; 3022 N->getMask(M); 3023 return ::isPSHUFLWMask(M, N->getValueType(0)); 3024} 3025 3026/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 3027/// is suitable for input to PALIGNR. 3028static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 3029 bool hasSSSE3) { 3030 int i, e = VT.getVectorNumElements(); 3031 3032 // Do not handle v2i64 / v2f64 shuffles with palignr. 3033 if (e < 4 || !hasSSSE3) 3034 return false; 3035 3036 for (i = 0; i != e; ++i) 3037 if (Mask[i] >= 0) 3038 break; 3039 3040 // All undef, not a palignr. 3041 if (i == e) 3042 return false; 3043 3044 // Determine if it's ok to perform a palignr with only the LHS, since we 3045 // don't have access to the actual shuffle elements to see if RHS is undef. 3046 bool Unary = Mask[i] < (int)e; 3047 bool NeedsUnary = false; 3048 3049 int s = Mask[i] - i; 3050 3051 // Check the rest of the elements to see if they are consecutive. 
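 // For the unary case the expected index wraps around within the single source
 // vector, which is what the (s+i) & (e-1) check below encodes.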
3052 for (++i; i != e; ++i) { 3053 int m = Mask[i]; 3054 if (m < 0) 3055 continue; 3056 3057 Unary = Unary && (m < (int)e); 3058 NeedsUnary = NeedsUnary || (m < s); 3059 3060 if (NeedsUnary && !Unary) 3061 return false; 3062 if (Unary && m != ((s+i) & (e-1))) 3063 return false; 3064 if (!Unary && m != (s+i)) 3065 return false; 3066 } 3067 return true; 3068} 3069 3070bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 3071 SmallVector<int, 8> M; 3072 N->getMask(M); 3073 return ::isPALIGNRMask(M, N->getValueType(0), true); 3074} 3075 3076/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 3077/// specifies a shuffle of elements that is suitable for input to SHUFP*. 3078static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3079 int NumElems = VT.getVectorNumElements(); 3080 if (NumElems != 2 && NumElems != 4) 3081 return false; 3082 3083 int Half = NumElems / 2; 3084 for (int i = 0; i < Half; ++i) 3085 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 3086 return false; 3087 for (int i = Half; i < NumElems; ++i) 3088 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 3089 return false; 3090 3091 return true; 3092} 3093 3094bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 3095 SmallVector<int, 8> M; 3096 N->getMask(M); 3097 return ::isSHUFPMask(M, N->getValueType(0)); 3098} 3099 3100/// isCommutedSHUFPMask - Returns true if the shuffle mask is exactly 3101/// the reverse of what x86 shuffles want. x86 shuffles require the lower 3102/// half elements to come from vector 1 (which would equal the destination) and 3103/// the upper half to come from vector 2. 3104static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3105 int NumElems = VT.getVectorNumElements(); 3106 3107 if (NumElems != 2 && NumElems != 4) 3108 return false; 3109 3110 int Half = NumElems / 2; 3111 for (int i = 0; i < Half; ++i) 3112 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 3113 return false; 3114 for (int i = Half; i < NumElems; ++i) 3115 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 3116 return false; 3117 return true; 3118} 3119 3120static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 3121 SmallVector<int, 8> M; 3122 N->getMask(M); 3123 return isCommutedSHUFPMask(M, N->getValueType(0)); 3124} 3125 3126/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 3127/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 3128bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 3129 if (N->getValueType(0).getVectorNumElements() != 4) 3130 return false; 3131 3132 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 3133 return isUndefOrEqual(N->getMaskElt(0), 6) && 3134 isUndefOrEqual(N->getMaskElt(1), 7) && 3135 isUndefOrEqual(N->getMaskElt(2), 2) && 3136 isUndefOrEqual(N->getMaskElt(3), 3); 3137} 3138 3139/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 3140/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 3141/// <2, 3, 2, 3> 3142bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 3143 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3144 3145 if (NumElems != 4) 3146 return false; 3147 3148 return isUndefOrEqual(N->getMaskElt(0), 2) && 3149 isUndefOrEqual(N->getMaskElt(1), 3) && 3150 isUndefOrEqual(N->getMaskElt(2), 2) && 3151 isUndefOrEqual(N->getMaskElt(3), 3); 3152} 3153 3154/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3155/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
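/// For example, for v4f32 this matches <4, 5, 2, 3>: the low half is taken
/// from V2 and the high half stays in place in V1.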
3156bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 3157 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3158 3159 if (NumElems != 2 && NumElems != 4) 3160 return false; 3161 3162 for (unsigned i = 0; i < NumElems/2; ++i) 3163 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 3164 return false; 3165 3166 for (unsigned i = NumElems/2; i < NumElems; ++i) 3167 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3168 return false; 3169 3170 return true; 3171} 3172 3173/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3174/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 3175bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 3176 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3177 3178 if ((NumElems != 2 && NumElems != 4) 3179 || N->getValueType(0).getSizeInBits() > 128) 3180 return false; 3181 3182 for (unsigned i = 0; i < NumElems/2; ++i) 3183 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3184 return false; 3185 3186 for (unsigned i = 0; i < NumElems/2; ++i) 3187 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 3188 return false; 3189 3190 return true; 3191} 3192 3193/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3194/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3195static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3196 bool V2IsSplat = false) { 3197 int NumElts = VT.getVectorNumElements(); 3198 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3199 return false; 3200 3201 // Handle vector lengths > 128 bits. Define a "section" as a set of 3202 // 128 bits. AVX defines UNPCK* to operate independently on 128-bit 3203 // sections. 3204 unsigned NumSections = VT.getSizeInBits() / 128; 3205 if (NumSections == 0 ) NumSections = 1; // Handle MMX 3206 unsigned NumSectionElts = NumElts / NumSections; 3207 3208 unsigned Start = 0; 3209 unsigned End = NumSectionElts; 3210 for (unsigned s = 0; s < NumSections; ++s) { 3211 for (unsigned i = Start, j = s * NumSectionElts; 3212 i != End; 3213 i += 2, ++j) { 3214 int BitI = Mask[i]; 3215 int BitI1 = Mask[i+1]; 3216 if (!isUndefOrEqual(BitI, j)) 3217 return false; 3218 if (V2IsSplat) { 3219 if (!isUndefOrEqual(BitI1, NumElts)) 3220 return false; 3221 } else { 3222 if (!isUndefOrEqual(BitI1, j + NumElts)) 3223 return false; 3224 } 3225 } 3226 // Process the next 128 bits. 3227 Start += NumSectionElts; 3228 End += NumSectionElts; 3229 } 3230 3231 return true; 3232} 3233 3234bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3235 SmallVector<int, 8> M; 3236 N->getMask(M); 3237 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3238} 3239 3240/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3241/// specifies a shuffle of elements that is suitable for input to UNPCKH. 
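/// For example, for v4f32 the canonical UNPCKH mask is <2, 6, 3, 7>, which
/// interleaves the high halves of the two sources.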
3242static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 3243 bool V2IsSplat = false) { 3244 int NumElts = VT.getVectorNumElements(); 3245 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3246 return false; 3247 3248 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3249 int BitI = Mask[i]; 3250 int BitI1 = Mask[i+1]; 3251 if (!isUndefOrEqual(BitI, j + NumElts/2)) 3252 return false; 3253 if (V2IsSplat) { 3254 if (!isUndefOrEqual(BitI1, NumElts)) 3255 return false; 3256 } else { 3257 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 3258 return false; 3259 } 3260 } 3261 return true; 3262} 3263 3264bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3265 SmallVector<int, 8> M; 3266 N->getMask(M); 3267 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 3268} 3269 3270/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3271/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 3272/// <0, 0, 1, 1> 3273static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3274 int NumElems = VT.getVectorNumElements(); 3275 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3276 return false; 3277 3278 // Handle vector lengths > 128 bits. Define a "section" as a set of 3279 // 128 bits. AVX defines UNPCK* to operate independently on 128-bit 3280 // sections. 3281 unsigned NumSections = VT.getSizeInBits() / 128; 3282 if (NumSections == 0 ) NumSections = 1; // Handle MMX 3283 unsigned NumSectionElts = NumElems / NumSections; 3284 3285 for (unsigned s = 0; s < NumSections; ++s) { 3286 for (unsigned i = s * NumSectionElts, j = s * NumSectionElts; 3287 i != NumSectionElts * (s + 1); 3288 i += 2, ++j) { 3289 int BitI = Mask[i]; 3290 int BitI1 = Mask[i+1]; 3291 3292 if (!isUndefOrEqual(BitI, j)) 3293 return false; 3294 if (!isUndefOrEqual(BitI1, j)) 3295 return false; 3296 } 3297 } 3298 3299 return true; 3300} 3301 3302bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 3303 SmallVector<int, 8> M; 3304 N->getMask(M); 3305 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 3306} 3307 3308/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3309/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 3310/// <2, 2, 3, 3> 3311static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3312 int NumElems = VT.getVectorNumElements(); 3313 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3314 return false; 3315 3316 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 3317 int BitI = Mask[i]; 3318 int BitI1 = Mask[i+1]; 3319 if (!isUndefOrEqual(BitI, j)) 3320 return false; 3321 if (!isUndefOrEqual(BitI1, j)) 3322 return false; 3323 } 3324 return true; 3325} 3326 3327bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 3328 SmallVector<int, 8> M; 3329 N->getMask(M); 3330 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 3331} 3332 3333/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3334/// specifies a shuffle of elements that is suitable for input to MOVSS, 3335/// MOVSD, and MOVD, i.e. setting the lowest element.
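/// For example, for v4i32 this is <4, 1, 2, 3>: the low element is taken from
/// V2 and the remaining elements are passed through from V1.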
3336static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3337 if (VT.getVectorElementType().getSizeInBits() < 32) 3338 return false; 3339 3340 int NumElts = VT.getVectorNumElements(); 3341 3342 if (!isUndefOrEqual(Mask[0], NumElts)) 3343 return false; 3344 3345 for (int i = 1; i < NumElts; ++i) 3346 if (!isUndefOrEqual(Mask[i], i)) 3347 return false; 3348 3349 return true; 3350} 3351 3352bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 3353 SmallVector<int, 8> M; 3354 N->getMask(M); 3355 return ::isMOVLMask(M, N->getValueType(0)); 3356} 3357 3358/// isCommutedMOVLMask - Returns true if the shuffle mask is exactly the reverse 3359/// of what x86 movss wants. X86 movss requires the lowest element to be the 3360/// lowest element of vector 2 and the other elements to come from vector 1 in order. 3361static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3362 bool V2IsSplat = false, bool V2IsUndef = false) { 3363 int NumOps = VT.getVectorNumElements(); 3364 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3365 return false; 3366 3367 if (!isUndefOrEqual(Mask[0], 0)) 3368 return false; 3369 3370 for (int i = 1; i < NumOps; ++i) 3371 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3372 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3373 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3374 return false; 3375 3376 return true; 3377} 3378 3379static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3380 bool V2IsUndef = false) { 3381 SmallVector<int, 8> M; 3382 N->getMask(M); 3383 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3384} 3385 3386/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3387/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3388bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 3389 if (N->getValueType(0).getVectorNumElements() != 4) 3390 return false; 3391 3392 // Expect 1, 1, 3, 3 3393 for (unsigned i = 0; i < 2; ++i) { 3394 int Elt = N->getMaskElt(i); 3395 if (Elt >= 0 && Elt != 1) 3396 return false; 3397 } 3398 3399 bool HasHi = false; 3400 for (unsigned i = 2; i < 4; ++i) { 3401 int Elt = N->getMaskElt(i); 3402 if (Elt >= 0 && Elt != 3) 3403 return false; 3404 if (Elt == 3) 3405 HasHi = true; 3406 } 3407 // Don't use movshdup if it can be done with a shufps. 3408 // FIXME: verify that matching u, u, 3, 3 is what we want. 3409 return HasHi; 3410} 3411 3412/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3413/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3414bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 3415 if (N->getValueType(0).getVectorNumElements() != 4) 3416 return false; 3417 3418 // Expect 0, 0, 2, 2 3419 for (unsigned i = 0; i < 2; ++i) 3420 if (N->getMaskElt(i) > 0) 3421 return false; 3422 3423 bool HasHi = false; 3424 for (unsigned i = 2; i < 4; ++i) { 3425 int Elt = N->getMaskElt(i); 3426 if (Elt >= 0 && Elt != 2) 3427 return false; 3428 if (Elt == 2) 3429 HasHi = true; 3430 } 3431 // Don't use movsldup if it can be done with a shufps. 3432 return HasHi; 3433} 3434 3435/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3436/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
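/// For example, for v2f64 this is <0, 0>: both result elements duplicate the
/// low element of the source.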
3437bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3438 int e = N->getValueType(0).getVectorNumElements() / 2; 3439 3440 for (int i = 0; i < e; ++i) 3441 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3442 return false; 3443 for (int i = 0; i < e; ++i) 3444 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3445 return false; 3446 return true; 3447} 3448 3449/// isVEXTRACTF128Index - Return true if the specified 3450/// EXTRACT_SUBVECTOR operand specifies a vector extract that is 3451/// suitable for input to VEXTRACTF128. 3452bool X86::isVEXTRACTF128Index(SDNode *N) { 3453 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3454 return false; 3455 3456 // The index should be aligned on a 128-bit boundary. 3457 uint64_t Index = 3458 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3459 3460 unsigned VL = N->getValueType(0).getVectorNumElements(); 3461 unsigned VBits = N->getValueType(0).getSizeInBits(); 3462 unsigned ElSize = VBits / VL; 3463 bool Result = (Index * ElSize) % 128 == 0; 3464 3465 return Result; 3466} 3467 3468/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR 3469/// operand specifies a subvector insert that is suitable for input to 3470/// VINSERTF128. 3471bool X86::isVINSERTF128Index(SDNode *N) { 3472 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3473 return false; 3474 3475 // The index should be aligned on a 128-bit boundary. 3476 uint64_t Index = 3477 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3478 3479 unsigned VL = N->getValueType(0).getVectorNumElements(); 3480 unsigned VBits = N->getValueType(0).getSizeInBits(); 3481 unsigned ElSize = VBits / VL; 3482 bool Result = (Index * ElSize) % 128 == 0; 3483 3484 return Result; 3485} 3486 3487/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3488/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3489unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3490 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3491 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3492 3493 unsigned Shift = (NumOperands == 4) ? 2 : 1; 3494 unsigned Mask = 0; 3495 for (int i = 0; i < NumOperands; ++i) { 3496 int Val = SVOp->getMaskElt(NumOperands-i-1); 3497 if (Val < 0) Val = 0; 3498 if (Val >= NumOperands) Val -= NumOperands; 3499 Mask |= Val; 3500 if (i != NumOperands - 1) 3501 Mask <<= Shift; 3502 } 3503 return Mask; 3504} 3505 3506/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3507/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3508unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3509 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3510 unsigned Mask = 0; 3511 // 8 nodes, but we only care about the last 4. 3512 for (unsigned i = 7; i >= 4; --i) { 3513 int Val = SVOp->getMaskElt(i); 3514 if (Val >= 0) 3515 Mask |= (Val - 4); 3516 if (i != 4) 3517 Mask <<= 2; 3518 } 3519 return Mask; 3520} 3521 3522/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3523/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3524unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3525 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3526 unsigned Mask = 0; 3527 // 8 nodes, but we only care about the first 4. 
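 // Each mask element contributes two bits; element 3 ends up in bits 7:6 and
 // element 0 in bits 1:0.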
3528 for (int i = 3; i >= 0; --i) { 3529 int Val = SVOp->getMaskElt(i); 3530 if (Val >= 0) 3531 Mask |= Val; 3532 if (i != 0) 3533 Mask <<= 2; 3534 } 3535 return Mask; 3536} 3537 3538/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3539/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3540unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3541 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3542 EVT VVT = N->getValueType(0); 3543 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3544 int Val = 0; 3545 3546 unsigned i, e; 3547 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3548 Val = SVOp->getMaskElt(i); 3549 if (Val >= 0) 3550 break; 3551 } 3552 return (Val - i) * EltSize; 3553} 3554 3555/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate 3556/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 3557/// instructions. 3558unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { 3559 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3560 llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); 3561 3562 uint64_t Index = 3563 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3564 3565 EVT VecVT = N->getOperand(0).getValueType(); 3566 EVT ElVT = VecVT.getVectorElementType(); 3567 3568 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 3569 3570 return Index / NumElemsPerChunk; 3571} 3572 3573/// getInsertVINSERTF128Immediate - Return the appropriate immediate 3574/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 3575/// instructions. 3576unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { 3577 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3578 llvm_unreachable("Illegal insert subvector for VINSERTF128"); 3579 3580 uint64_t Index = 3581 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3582 3583 EVT VecVT = N->getValueType(0); 3584 EVT ElVT = VecVT.getVectorElementType(); 3585 3586 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 3587 3588 return Index / NumElemsPerChunk; 3589} 3590 3591/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3592/// constant +0.0. 3593bool X86::isZeroNode(SDValue Elt) { 3594 return ((isa<ConstantSDNode>(Elt) && 3595 cast<ConstantSDNode>(Elt)->isNullValue()) || 3596 (isa<ConstantFPSDNode>(Elt) && 3597 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3598} 3599 3600/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3601/// their permute mask. 3602static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3603 SelectionDAG &DAG) { 3604 EVT VT = SVOp->getValueType(0); 3605 unsigned NumElems = VT.getVectorNumElements(); 3606 SmallVector<int, 8> MaskVec; 3607 3608 for (unsigned i = 0; i != NumElems; ++i) { 3609 int idx = SVOp->getMaskElt(i); 3610 if (idx < 0) 3611 MaskVec.push_back(idx); 3612 else if (idx < (int)NumElems) 3613 MaskVec.push_back(idx + NumElems); 3614 else 3615 MaskVec.push_back(idx - NumElems); 3616 } 3617 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3618 SVOp->getOperand(0), &MaskVec[0]); 3619} 3620 3621/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3622/// the two vector operands have swapped position. 
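/// For example, a v4f32 mask <4, 1, 6, 3> becomes <0, 5, 2, 7>.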
3623static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 3624 unsigned NumElems = VT.getVectorNumElements(); 3625 for (unsigned i = 0; i != NumElems; ++i) { 3626 int idx = Mask[i]; 3627 if (idx < 0) 3628 continue; 3629 else if (idx < (int)NumElems) 3630 Mask[i] = idx + NumElems; 3631 else 3632 Mask[i] = idx - NumElems; 3633 } 3634} 3635 3636/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 3637/// match movhlps. The lower half elements should come from the upper half of 3638/// V1 (and in order), and the upper half elements should come from the upper 3639/// half of V2 (and in order). 3640static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 3641 if (Op->getValueType(0).getVectorNumElements() != 4) 3642 return false; 3643 for (unsigned i = 0, e = 2; i != e; ++i) 3644 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 3645 return false; 3646 for (unsigned i = 2; i != 4; ++i) 3647 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 3648 return false; 3649 return true; 3650} 3651 3652/// isScalarLoadToVector - Returns true if the node is a scalar load that 3653/// is promoted to a vector. It also returns the LoadSDNode by reference if 3654/// required. 3655static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 3656 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 3657 return false; 3658 N = N->getOperand(0).getNode(); 3659 if (!ISD::isNON_EXTLoad(N)) 3660 return false; 3661 if (LD) 3662 *LD = cast<LoadSDNode>(N); 3663 return true; 3664} 3665 3666/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 3667/// match movlp{s|d}. The lower half elements should come from lower half of 3668/// V1 (and in order), and the upper half elements should come from the upper 3669/// half of V2 (and in order). And since V1 will become the source of the 3670/// MOVLP, it must be either a vector load or a scalar load to vector. 3671static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 3672 ShuffleVectorSDNode *Op) { 3673 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 3674 return false; 3675 // If V2 is a vector load, don't do this transformation. We will try to use 3676 // a load-folding shufps instead. 3677 if (ISD::isNON_EXTLoad(V2)) 3678 return false; 3679 3680 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 3681 3682 if (NumElems != 2 && NumElems != 4) 3683 return false; 3684 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3685 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 3686 return false; 3687 for (unsigned i = NumElems/2; i != NumElems; ++i) 3688 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 3689 return false; 3690 return true; 3691} 3692 3693/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 3694/// all the same. 3695static bool isSplatVector(SDNode *N) { 3696 if (N->getOpcode() != ISD::BUILD_VECTOR) 3697 return false; 3698 3699 SDValue SplatValue = N->getOperand(0); 3700 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 3701 if (N->getOperand(i) != SplatValue) 3702 return false; 3703 return true; 3704} 3705 3706/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 3707/// to a zero vector.
3708/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 3709static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3710 SDValue V1 = N->getOperand(0); 3711 SDValue V2 = N->getOperand(1); 3712 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3713 for (unsigned i = 0; i != NumElems; ++i) { 3714 int Idx = N->getMaskElt(i); 3715 if (Idx >= (int)NumElems) { 3716 unsigned Opc = V2.getOpcode(); 3717 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3718 continue; 3719 if (Opc != ISD::BUILD_VECTOR || 3720 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3721 return false; 3722 } else if (Idx >= 0) { 3723 unsigned Opc = V1.getOpcode(); 3724 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3725 continue; 3726 if (Opc != ISD::BUILD_VECTOR || 3727 !X86::isZeroNode(V1.getOperand(Idx))) 3728 return false; 3729 } 3730 } 3731 return true; 3732} 3733 3734/// getZeroVector - Returns a vector of specified type with all zero elements. 3735/// 3736static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3737 DebugLoc dl) { 3738 assert(VT.isVector() && "Expected a vector type"); 3739 3740 // Always build SSE zero vectors as <4 x i32> bitcasted 3741 // to their dest type. This ensures they get CSE'd. 3742 SDValue Vec; 3743 if (VT.getSizeInBits() == 128) { // SSE 3744 if (HasSSE2) { // SSE2 3745 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3746 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3747 } else { // SSE1 3748 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3749 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3750 } 3751 } else if (VT.getSizeInBits() == 256) { // AVX 3752 // 256-bit logic and arithmetic instructions in AVX are 3753 // all floating-point; there is no support for integer ops, 3754 // so default to emitting fp zeroed vectors. 3755 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3756 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 3757 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 3758 } 3759 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 3760} 3761 3762/// getOnesVector - Returns a vector of specified type with all bits set. 3763/// 3764static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3765 assert(VT.isVector() && "Expected a vector type"); 3766 3767 // Always build ones vectors as <4 x i32> bitcasted to their dest 3768 // type. This ensures they get CSE'd. 3769 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3770 SDValue Vec; 3771 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3772 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 3773} 3774 3775 3776/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3777/// that point to V2 point to its first element.
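/// For example, a v4i32 mask <1, 6, 2, 7> becomes <1, 4, 2, 4>.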
3778static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3779 EVT VT = SVOp->getValueType(0); 3780 unsigned NumElems = VT.getVectorNumElements(); 3781 3782 bool Changed = false; 3783 SmallVector<int, 8> MaskVec; 3784 SVOp->getMask(MaskVec); 3785 3786 for (unsigned i = 0; i != NumElems; ++i) { 3787 if (MaskVec[i] > (int)NumElems) { 3788 MaskVec[i] = NumElems; 3789 Changed = true; 3790 } 3791 } 3792 if (Changed) 3793 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3794 SVOp->getOperand(1), &MaskVec[0]); 3795 return SDValue(SVOp, 0); 3796} 3797 3798/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd 3799/// operation of the specified width. 3800static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3801 SDValue V2) { 3802 unsigned NumElems = VT.getVectorNumElements(); 3803 SmallVector<int, 8> Mask; 3804 Mask.push_back(NumElems); 3805 for (unsigned i = 1; i != NumElems; ++i) 3806 Mask.push_back(i); 3807 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3808} 3809 3810/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3811static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3812 SDValue V2) { 3813 unsigned NumElems = VT.getVectorNumElements(); 3814 SmallVector<int, 8> Mask; 3815 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3816 Mask.push_back(i); 3817 Mask.push_back(i + NumElems); 3818 } 3819 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3820} 3821 3822/// getUnpackh - Returns a vector_shuffle node for an unpackh operation. 3823static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3824 SDValue V2) { 3825 unsigned NumElems = VT.getVectorNumElements(); 3826 unsigned Half = NumElems/2; 3827 SmallVector<int, 8> Mask; 3828 for (unsigned i = 0; i != Half; ++i) { 3829 Mask.push_back(i + Half); 3830 Mask.push_back(i + NumElems + Half); 3831 } 3832 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3833} 3834 3835/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32. 3836static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 3837 EVT PVT = MVT::v4f32; 3838 EVT VT = SV->getValueType(0); 3839 DebugLoc dl = SV->getDebugLoc(); 3840 SDValue V1 = SV->getOperand(0); 3841 int NumElems = VT.getVectorNumElements(); 3842 int EltNo = SV->getSplatIndex(); 3843 3844 // Unpack elements to the correct location. 3845 while (NumElems > 4) { 3846 if (EltNo < NumElems/2) { 3847 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3848 } else { 3849 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3850 EltNo -= NumElems/2; 3851 } 3852 NumElems >>= 1; 3853 } 3854 3855 // Perform the splat. 3856 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3857 V1 = DAG.getNode(ISD::BITCAST, dl, PVT, V1); 3858 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3859 return DAG.getNode(ISD::BITCAST, dl, VT, V1); 3860} 3861 3862/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3863/// vector and a zero or undef vector. This produces a shuffle where the low 3864/// element of V2 is swizzled into the zero/undef vector, landing at element 3865/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 3866static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3867 bool isZero, bool HasSSE2, 3868 SelectionDAG &DAG) { 3869 EVT VT = V2.getValueType(); 3870 SDValue V1 = isZero 3871 ?
getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3872 unsigned NumElems = VT.getVectorNumElements(); 3873 SmallVector<int, 16> MaskVec; 3874 for (unsigned i = 0; i != NumElems; ++i) 3875 // If this is the insertion idx, put the low elt of V2 here. 3876 MaskVec.push_back(i == Idx ? NumElems : i); 3877 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3878} 3879 3880/// getShuffleScalarElt - Returns the scalar element that will make up the ith 3881/// element of the result of the vector shuffle. 3882SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, 3883 unsigned Depth) { 3884 if (Depth == 6) 3885 return SDValue(); // Limit search depth. 3886 3887 SDValue V = SDValue(N, 0); 3888 EVT VT = V.getValueType(); 3889 unsigned Opcode = V.getOpcode(); 3890 3891 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 3892 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 3893 Index = SV->getMaskElt(Index); 3894 3895 if (Index < 0) 3896 return DAG.getUNDEF(VT.getVectorElementType()); 3897 3898 int NumElems = VT.getVectorNumElements(); 3899 SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); 3900 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); 3901 } 3902 3903 // Recurse into target specific vector shuffles to find scalars. 3904 if (isTargetShuffle(Opcode)) { 3905 int NumElems = VT.getVectorNumElements(); 3906 SmallVector<unsigned, 16> ShuffleMask; 3907 SDValue ImmN; 3908 3909 switch(Opcode) { 3910 case X86ISD::SHUFPS: 3911 case X86ISD::SHUFPD: 3912 ImmN = N->getOperand(N->getNumOperands()-1); 3913 DecodeSHUFPSMask(NumElems, 3914 cast<ConstantSDNode>(ImmN)->getZExtValue(), 3915 ShuffleMask); 3916 break; 3917 case X86ISD::PUNPCKHBW: 3918 case X86ISD::PUNPCKHWD: 3919 case X86ISD::PUNPCKHDQ: 3920 case X86ISD::PUNPCKHQDQ: 3921 DecodePUNPCKHMask(NumElems, ShuffleMask); 3922 break; 3923 case X86ISD::UNPCKHPS: 3924 case X86ISD::UNPCKHPD: 3925 DecodeUNPCKHPMask(NumElems, ShuffleMask); 3926 break; 3927 case X86ISD::PUNPCKLBW: 3928 case X86ISD::PUNPCKLWD: 3929 case X86ISD::PUNPCKLDQ: 3930 case X86ISD::PUNPCKLQDQ: 3931 DecodePUNPCKLMask(VT, ShuffleMask); 3932 break; 3933 case X86ISD::UNPCKLPS: 3934 case X86ISD::UNPCKLPD: 3935 case X86ISD::VUNPCKLPS: 3936 case X86ISD::VUNPCKLPD: 3937 case X86ISD::VUNPCKLPSY: 3938 case X86ISD::VUNPCKLPDY: 3939 DecodeUNPCKLPMask(VT, ShuffleMask); 3940 break; 3941 case X86ISD::MOVHLPS: 3942 DecodeMOVHLPSMask(NumElems, ShuffleMask); 3943 break; 3944 case X86ISD::MOVLHPS: 3945 DecodeMOVLHPSMask(NumElems, ShuffleMask); 3946 break; 3947 case X86ISD::PSHUFD: 3948 ImmN = N->getOperand(N->getNumOperands()-1); 3949 DecodePSHUFMask(NumElems, 3950 cast<ConstantSDNode>(ImmN)->getZExtValue(), 3951 ShuffleMask); 3952 break; 3953 case X86ISD::PSHUFHW: 3954 ImmN = N->getOperand(N->getNumOperands()-1); 3955 DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 3956 ShuffleMask); 3957 break; 3958 case X86ISD::PSHUFLW: 3959 ImmN = N->getOperand(N->getNumOperands()-1); 3960 DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 3961 ShuffleMask); 3962 break; 3963 case X86ISD::MOVSS: 3964 case X86ISD::MOVSD: { 3965 // The index 0 always comes from the first element of the second source, 3966 // this is why MOVSS and MOVSD are used in the first place. The other 3967 // elements come from the other positions of the first source vector. 3968 unsigned OpNum = (Index == 0) ? 
1 : 0; 3969 return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG, 3970 Depth+1); 3971 } 3972 default: 3973 assert(0 && "not implemented for target shuffle node"); 3974 return SDValue(); 3975 } 3976 3977 Index = ShuffleMask[Index]; 3978 if (Index < 0) 3979 return DAG.getUNDEF(VT.getVectorElementType()); 3980 3981 SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1); 3982 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, 3983 Depth+1); 3984 } 3985 3986 // Actual nodes that may contain scalar elements 3987 if (Opcode == ISD::BITCAST) { 3988 V = V.getOperand(0); 3989 EVT SrcVT = V.getValueType(); 3990 unsigned NumElems = VT.getVectorNumElements(); 3991 3992 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) 3993 return SDValue(); 3994 } 3995 3996 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) 3997 return (Index == 0) ? V.getOperand(0) 3998 : DAG.getUNDEF(VT.getVectorElementType()); 3999 4000 if (V.getOpcode() == ISD::BUILD_VECTOR) 4001 return V.getOperand(Index); 4002 4003 return SDValue(); 4004} 4005 4006/// getNumOfConsecutiveZeros - Return the number of elements of a vector 4007/// shuffle operation that are consecutive zeros. The 4008/// search can start in two different directions, from left or right. 4009static 4010unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems, 4011 bool ZerosFromLeft, SelectionDAG &DAG) { 4012 int i = 0; 4013 4014 while (i < NumElems) { 4015 unsigned Index = ZerosFromLeft ? i : NumElems-i-1; 4016 SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0); 4017 if (!(Elt.getNode() && 4018 (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)))) 4019 break; 4020 ++i; 4021 } 4022 4023 return i; 4024} 4025 4026/// isShuffleMaskConsecutive - Check if the shuffle mask indices from MaskI to 4027/// MaskE correspond consecutively to elements from one of the vector operands, 4028/// starting from its index OpIdx. Also report in OpNum which source vector operand is used. 4029static 4030bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE, 4031 int OpIdx, int NumElems, unsigned &OpNum) { 4032 bool SeenV1 = false; 4033 bool SeenV2 = false; 4034 4035 for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) { 4036 int Idx = SVOp->getMaskElt(i); 4037 // Ignore undef indices 4038 if (Idx < 0) 4039 continue; 4040 4041 if (Idx < NumElems) 4042 SeenV1 = true; 4043 else 4044 SeenV2 = true; 4045 4046 // Only accept consecutive elements from the same vector 4047 if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2)) 4048 return false; 4049 } 4050 4051 OpNum = SeenV1 ? 0 : 1; 4052 return true; 4053} 4054 4055/// isVectorShiftRight - Returns true if the shuffle can be implemented as a 4056/// logical right shift of a vector. 4057static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4058 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4059 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 4060 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 4061 false /* check zeros from right */, DAG); 4062 unsigned OpSrc; 4063 4064 if (!NumZeros) 4065 return false; 4066 4067 // Considering the elements in the mask that are not consecutive zeros, 4068 // check if they consecutively come from only one of the source vectors.
4069 // 4070 // V1 = {X, A, B, C} 0 4071 // \ \ \ / 4072 // vector_shuffle V1, V2 <1, 2, 3, X> 4073 // 4074 if (!isShuffleMaskConsecutive(SVOp, 4075 0, // Mask Start Index 4076 NumElems-NumZeros-1, // Mask End Index 4077 NumZeros, // Where to start looking in the src vector 4078 NumElems, // Number of elements in vector 4079 OpSrc)) // Which source operand ? 4080 return false; 4081 4082 isLeft = false; 4083 ShAmt = NumZeros; 4084 ShVal = SVOp->getOperand(OpSrc); 4085 return true; 4086} 4087 4088/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a 4089/// logical left shift of a vector. 4090static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4091 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4092 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 4093 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 4094 true /* check zeros from left */, DAG); 4095 unsigned OpSrc; 4096 4097 if (!NumZeros) 4098 return false; 4099 4100 // Considering the elements in the mask that are not consecutive zeros, 4101 // check if they consecutively come from only one of the source vectors. 4102 // 4103 // 0 { A, B, X, X } = V2 4104 // / \ / / 4105 // vector_shuffle V1, V2 <X, X, 4, 5> 4106 // 4107 if (!isShuffleMaskConsecutive(SVOp, 4108 NumZeros, // Mask Start Index 4109 NumElems-1, // Mask End Index 4110 0, // Where to start looking in the src vector 4111 NumElems, // Number of elements in vector 4112 OpSrc)) // Which source operand ? 4113 return false; 4114 4115 isLeft = true; 4116 ShAmt = NumZeros; 4117 ShVal = SVOp->getOperand(OpSrc); 4118 return true; 4119} 4120 4121/// isVectorShift - Returns true if the shuffle can be implemented as a 4122/// logical left or right shift of a vector. 4123static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4124 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4125 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 4126 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 4127 return true; 4128 4129 return false; 4130} 4131 4132/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
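/// Pairs of adjacent nonzero bytes are packed into i16 elements
/// (low byte | high byte << 8), inserted into a v8i16, and the result is
/// bitcast back to v16i8.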
4133/// 4134static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 4135 unsigned NumNonZero, unsigned NumZero, 4136 SelectionDAG &DAG, 4137 const TargetLowering &TLI) { 4138 if (NumNonZero > 8) 4139 return SDValue(); 4140 4141 DebugLoc dl = Op.getDebugLoc(); 4142 SDValue V(0, 0); 4143 bool First = true; 4144 for (unsigned i = 0; i < 16; ++i) { 4145 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 4146 if (ThisIsNonZero && First) { 4147 if (NumZero) 4148 V = getZeroVector(MVT::v8i16, true, DAG, dl); 4149 else 4150 V = DAG.getUNDEF(MVT::v8i16); 4151 First = false; 4152 } 4153 4154 if ((i & 1) != 0) { 4155 SDValue ThisElt(0, 0), LastElt(0, 0); 4156 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 4157 if (LastIsNonZero) { 4158 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 4159 MVT::i16, Op.getOperand(i-1)); 4160 } 4161 if (ThisIsNonZero) { 4162 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 4163 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 4164 ThisElt, DAG.getConstant(8, MVT::i8)); 4165 if (LastIsNonZero) 4166 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 4167 } else 4168 ThisElt = LastElt; 4169 4170 if (ThisElt.getNode()) 4171 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 4172 DAG.getIntPtrConstant(i/2)); 4173 } 4174 } 4175 4176 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 4177} 4178 4179/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 4180/// 4181static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 4182 unsigned NumNonZero, unsigned NumZero, 4183 SelectionDAG &DAG, 4184 const TargetLowering &TLI) { 4185 if (NumNonZero > 4) 4186 return SDValue(); 4187 4188 DebugLoc dl = Op.getDebugLoc(); 4189 SDValue V(0, 0); 4190 bool First = true; 4191 for (unsigned i = 0; i < 8; ++i) { 4192 bool isNonZero = (NonZeros & (1 << i)) != 0; 4193 if (isNonZero) { 4194 if (First) { 4195 if (NumZero) 4196 V = getZeroVector(MVT::v8i16, true, DAG, dl); 4197 else 4198 V = DAG.getUNDEF(MVT::v8i16); 4199 First = false; 4200 } 4201 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 4202 MVT::v8i16, V, Op.getOperand(i), 4203 DAG.getIntPtrConstant(i)); 4204 } 4205 } 4206 4207 return V; 4208} 4209 4210/// getVShift - Return a vector logical shift node. 4211/// 4212static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 4213 unsigned NumBits, SelectionDAG &DAG, 4214 const TargetLowering &TLI, DebugLoc dl) { 4215 EVT ShVT = MVT::v2i64; 4216 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 4217 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 4218 return DAG.getNode(ISD::BITCAST, dl, VT, 4219 DAG.getNode(Opc, dl, ShVT, SrcOp, 4220 DAG.getConstant(NumBits, 4221 TLI.getShiftAmountTy(SrcOp.getValueType())))); 4222} 4223 4224SDValue 4225X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 4226 SelectionDAG &DAG) const { 4227 4228 // Check if the scalar load can be widened into a vector load. And if 4229 // the address is "base + cst" see if the cst can be "absorbed" into 4230 // the shuffle mask. 
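 // For example, a 4-byte load at (stack object + 8) can be widened to a
 // 16-byte load of the object with the splat mask <2, 2, 2, 2>.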
4231 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 4232 SDValue Ptr = LD->getBasePtr(); 4233 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 4234 return SDValue(); 4235 EVT PVT = LD->getValueType(0); 4236 if (PVT != MVT::i32 && PVT != MVT::f32) 4237 return SDValue(); 4238 4239 int FI = -1; 4240 int64_t Offset = 0; 4241 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 4242 FI = FINode->getIndex(); 4243 Offset = 0; 4244 } else if (DAG.isBaseWithConstantOffset(Ptr) && 4245 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 4246 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 4247 Offset = Ptr.getConstantOperandVal(1); 4248 Ptr = Ptr.getOperand(0); 4249 } else { 4250 return SDValue(); 4251 } 4252 4253 SDValue Chain = LD->getChain(); 4254 // Make sure the stack object alignment is at least 16. 4255 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4256 if (DAG.InferPtrAlignment(Ptr) < 16) { 4257 if (MFI->isFixedObjectIndex(FI)) { 4258 // Can't change the alignment. FIXME: It's possible to compute 4259 // the exact stack offset and reference FI + adjusted offset instead, 4260 // if someone *really* cares about this. That's the way to implement it. 4261 return SDValue(); 4262 } else { 4263 MFI->setObjectAlignment(FI, 16); 4264 } 4265 } 4266 4267 // (Offset % 16) must be a multiple of 4. The address is then 4268 // Ptr + (Offset & ~15). 4269 if (Offset < 0) 4270 return SDValue(); 4271 if ((Offset % 16) & 3) 4272 return SDValue(); 4273 int64_t StartOffset = Offset & ~15; 4274 if (StartOffset) 4275 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 4276 Ptr, DAG.getConstant(StartOffset, Ptr.getValueType())); 4277 4278 int EltNo = (Offset - StartOffset) >> 2; 4279 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 4280 EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32; 4281 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr, 4282 LD->getPointerInfo().getWithOffset(StartOffset), 4283 false, false, 0); 4284 // Canonicalize it to a v4i32 shuffle. 4285 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); 4286 return DAG.getNode(ISD::BITCAST, dl, VT, 4287 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 4288 DAG.getUNDEF(MVT::v4i32), &Mask[0])); 4289 } 4290 4291 return SDValue(); 4292} 4293 4294/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 4295/// vector of type 'VT', see if the elements can be replaced by a single large 4296/// load which has the same value as a build_vector whose operands are 'elts'. 4297/// 4298/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 4299/// 4300/// FIXME: we'd also like to handle the case where the last elements are zero 4301/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 4302/// There's even a handy isZeroNode for that purpose. 4303static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 4304 DebugLoc &DL, SelectionDAG &DAG) { 4305 EVT EltVT = VT.getVectorElementType(); 4306 unsigned NumElems = Elts.size(); 4307 4308 LoadSDNode *LDBase = NULL; 4309 unsigned LastLoadedElt = -1U; 4310 4311 // For each element in the initializer, see if we've found a load or an undef. 4312 // If we don't find an initial load element, or later load elements are 4313 // non-consecutive, bail out.
4314 for (unsigned i = 0; i < NumElems; ++i) { 4315 SDValue Elt = Elts[i]; 4316 4317 if (!Elt.getNode() || 4318 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 4319 return SDValue(); 4320 if (!LDBase) { 4321 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 4322 return SDValue(); 4323 LDBase = cast<LoadSDNode>(Elt.getNode()); 4324 LastLoadedElt = i; 4325 continue; 4326 } 4327 if (Elt.getOpcode() == ISD::UNDEF) 4328 continue; 4329 4330 LoadSDNode *LD = cast<LoadSDNode>(Elt); 4331 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 4332 return SDValue(); 4333 LastLoadedElt = i; 4334 } 4335 4336 // If we have found an entire vector of loads and undefs, then return a large 4337 // load of the entire vector width starting at the base pointer. If we found 4338 // consecutive loads for the low half, generate a vzext_load node. 4339 if (LastLoadedElt == NumElems - 1) { 4340 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 4341 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4342 LDBase->getPointerInfo(), 4343 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 4344 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4345 LDBase->getPointerInfo(), 4346 LDBase->isVolatile(), LDBase->isNonTemporal(), 4347 LDBase->getAlignment()); 4348 } else if (NumElems == 4 && LastLoadedElt == 1) { 4349 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 4350 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 4351 SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, 4352 Ops, 2, MVT::i32, 4353 LDBase->getMemOperand()); 4354 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 4355 } 4356 return SDValue(); 4357} 4358 4359SDValue 4360X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 4361 DebugLoc dl = Op.getDebugLoc(); 4362 4363 EVT VT = Op.getValueType(); 4364 EVT ExtVT = VT.getVectorElementType(); 4365 4366 unsigned NumElems = Op.getNumOperands(); 4367 4368 // For AVX-length vectors, build the individual 128-bit pieces and 4369 // use shuffles to put them in place. 4370 if (VT.getSizeInBits() > 128 && 4371 Subtarget->hasAVX() && 4372 !ISD::isBuildVectorAllZeros(Op.getNode())) { 4373 SmallVector<SDValue, 8> V; 4374 V.resize(NumElems); 4375 for (unsigned i = 0; i < NumElems; ++i) { 4376 V[i] = Op.getOperand(i); 4377 } 4378 4379 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); 4380 4381 // Build the lower subvector. 4382 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); 4383 // Build the upper subvector. 4384 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], 4385 NumElems/2); 4386 4387 return ConcatVectors(Lower, Upper, DAG); 4388 } 4389 4390 // All zeros are handled with pxor in SSE2 and above, xorps in SSE1. 4391 // All ones are handled with pcmpeqd. In AVX, zeros are handled with 4392 // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256-bit version of 4393 // pcmpeqd is present, so AllOnes is ignored. 4394 if (ISD::isBuildVectorAllZeros(Op.getNode()) || 4395 (Op.getValueType().getSizeInBits() != 256 && 4396 ISD::isBuildVectorAllOnes(Op.getNode()))) { 4397 // Canonicalize this to <4 x i32> (SSE) to 4398 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 4399 // eliminated on x86-32 hosts.
4400 if (Op.getValueType() == MVT::v4i32) 4401 return Op; 4402 4403 if (ISD::isBuildVectorAllOnes(Op.getNode())) 4404 return getOnesVector(Op.getValueType(), DAG, dl); 4405 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 4406 } 4407 4408 unsigned EVTBits = ExtVT.getSizeInBits(); 4409 4410 unsigned NumZero = 0; 4411 unsigned NumNonZero = 0; 4412 unsigned NonZeros = 0; 4413 bool IsAllConstants = true; 4414 SmallSet<SDValue, 8> Values; 4415 for (unsigned i = 0; i < NumElems; ++i) { 4416 SDValue Elt = Op.getOperand(i); 4417 if (Elt.getOpcode() == ISD::UNDEF) 4418 continue; 4419 Values.insert(Elt); 4420 if (Elt.getOpcode() != ISD::Constant && 4421 Elt.getOpcode() != ISD::ConstantFP) 4422 IsAllConstants = false; 4423 if (X86::isZeroNode(Elt)) 4424 NumZero++; 4425 else { 4426 NonZeros |= (1 << i); 4427 NumNonZero++; 4428 } 4429 } 4430 4431 // All undef vector. Return an UNDEF. All zero vectors were handled above. 4432 if (NumNonZero == 0) 4433 return DAG.getUNDEF(VT); 4434 4435 // Special case for single non-zero, non-undef, element. 4436 if (NumNonZero == 1) { 4437 unsigned Idx = CountTrailingZeros_32(NonZeros); 4438 SDValue Item = Op.getOperand(Idx); 4439 4440 // If this is an insertion of an i64 value on x86-32, and if the top bits of 4441 // the value are obviously zero, truncate the value to i32 and do the 4442 // insertion that way. Only do this if the value is non-constant or if the 4443 // value is a constant being inserted into element 0. It is cheaper to do 4444 // a constant pool load than it is to do a movd + shuffle. 4445 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 4446 (!IsAllConstants || Idx == 0)) { 4447 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 4448 // Handle SSE only. 4449 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 4450 EVT VecVT = MVT::v4i32; 4451 unsigned VecElts = 4; 4452 4453 // Truncate the value (which may itself be a constant) to i32, and 4454 // convert it to a vector with movd (S2V+shuffle to zero extend). 4455 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 4456 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 4457 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4458 Subtarget->hasSSE2(), DAG); 4459 4460 // Now we have our 32-bit value zero extended in the low element of 4461 // a vector. If Idx != 0, swizzle it into place. 4462 if (Idx != 0) { 4463 SmallVector<int, 4> Mask; 4464 Mask.push_back(Idx); 4465 for (unsigned i = 1; i != VecElts; ++i) 4466 Mask.push_back(i); 4467 Item = DAG.getVectorShuffle(VecVT, dl, Item, 4468 DAG.getUNDEF(Item.getValueType()), 4469 &Mask[0]); 4470 } 4471 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); 4472 } 4473 } 4474 4475 // If we have a constant or non-constant insertion into the low element of 4476 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 4477 // the rest of the elements. This will be matched as movd/movq/movss/movsd 4478 // depending on what the source datatype is. 4479 if (Idx == 0) { 4480 if (NumZero == 0) { 4481 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4482 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 4483 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 4484 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4485 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 
4486 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 4487 DAG); 4488 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 4489 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 4490 assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); 4491 EVT MiddleVT = MVT::v4i32; 4492 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 4493 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4494 Subtarget->hasSSE2(), DAG); 4495 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 4496 } 4497 } 4498 4499 // Is it a vector logical left shift? 4500 if (NumElems == 2 && Idx == 1 && 4501 X86::isZeroNode(Op.getOperand(0)) && 4502 !X86::isZeroNode(Op.getOperand(1))) { 4503 unsigned NumBits = VT.getSizeInBits(); 4504 return getVShift(true, VT, 4505 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4506 VT, Op.getOperand(1)), 4507 NumBits/2, DAG, *this, dl); 4508 } 4509 4510 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 4511 return SDValue(); 4512 4513 // Otherwise, if this is a vector with i32 or f32 elements, and the element 4514 // is a non-constant being inserted into an element other than the low one, 4515 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 4516 // movd/movss) to move this into the low element, then shuffle it into 4517 // place. 4518 if (EVTBits == 32) { 4519 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4520 4521 // Turn it into a shuffle of zero and zero-extended scalar to vector. 4522 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 4523 Subtarget->hasSSE2(), DAG); 4524 SmallVector<int, 8> MaskVec; 4525 for (unsigned i = 0; i < NumElems; i++) 4526 MaskVec.push_back(i == Idx ? 0 : 1); 4527 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 4528 } 4529 } 4530 4531 // Splat is obviously ok. Let legalizer expand it to a shuffle. 4532 if (Values.size() == 1) { 4533 if (EVTBits == 32) { 4534 // Instead of a shuffle like this: 4535 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 4536 // Check if it's possible to issue this instead. 4537 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 4538 unsigned Idx = CountTrailingZeros_32(NonZeros); 4539 SDValue Item = Op.getOperand(Idx); 4540 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 4541 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 4542 } 4543 return SDValue(); 4544 } 4545 4546 // A vector full of immediates; various special cases are already 4547 // handled, so this is best done with a single constant-pool load. 4548 if (IsAllConstants) 4549 return SDValue(); 4550 4551 // Let legalizer expand 2-wide build_vectors. 4552 if (EVTBits == 64) { 4553 if (NumNonZero == 1) { 4554 // One half is zero or undef. 4555 unsigned Idx = CountTrailingZeros_32(NonZeros); 4556 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 4557 Op.getOperand(Idx)); 4558 return getShuffleVectorZeroOrUndef(V2, Idx, true, 4559 Subtarget->hasSSE2(), DAG); 4560 } 4561 return SDValue(); 4562 } 4563 4564 // If element VT is < 32 bits, convert it to inserts into a zero vector. 4565 if (EVTBits == 8 && NumElems == 16) { 4566 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 4567 *this); 4568 if (V.getNode()) return V; 4569 } 4570 4571 if (EVTBits == 16 && NumElems == 8) { 4572 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 4573 *this); 4574 if (V.getNode()) return V; 4575 } 4576 4577 // If element VT is == 32 bits, turn it into a number of shuffles. 
4578 SmallVector<SDValue, 8> V; 4579 V.resize(NumElems); 4580 if (NumElems == 4 && NumZero > 0) { 4581 for (unsigned i = 0; i < 4; ++i) { 4582 bool isZero = !(NonZeros & (1 << i)); 4583 if (isZero) 4584 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4585 else 4586 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4587 } 4588 4589 for (unsigned i = 0; i < 2; ++i) { 4590 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 4591 default: break; 4592 case 0: 4593 V[i] = V[i*2]; // Must be a zero vector. 4594 break; 4595 case 1: 4596 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 4597 break; 4598 case 2: 4599 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 4600 break; 4601 case 3: 4602 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 4603 break; 4604 } 4605 } 4606 4607 SmallVector<int, 8> MaskVec; 4608 bool Reverse = (NonZeros & 0x3) == 2; 4609 for (unsigned i = 0; i < 2; ++i) 4610 MaskVec.push_back(Reverse ? 1-i : i); 4611 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 4612 for (unsigned i = 0; i < 2; ++i) 4613 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 4614 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 4615 } 4616 4617 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 4618 // Check for a build vector of consecutive loads. 4619 for (unsigned i = 0; i < NumElems; ++i) 4620 V[i] = Op.getOperand(i); 4621 4622 // Check for elements which are consecutive loads. 4623 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 4624 if (LD.getNode()) 4625 return LD; 4626 4627 // For SSE 4.1, use insertps to put the high elements into the low element. 4628 if (getSubtarget()->hasSSE41()) { 4629 SDValue Result; 4630 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 4631 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 4632 else 4633 Result = DAG.getUNDEF(VT); 4634 4635 for (unsigned i = 1; i < NumElems; ++i) { 4636 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 4637 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 4638 Op.getOperand(i), DAG.getIntPtrConstant(i)); 4639 } 4640 return Result; 4641 } 4642 4643 // Otherwise, expand into a number of unpckl*, start by extending each of 4644 // our (non-undef) elements to the full vector width with the element in the 4645 // bottom slot of the vector (which generates no code for SSE). 4646 for (unsigned i = 0; i < NumElems; ++i) { 4647 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 4648 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4649 else 4650 V[i] = DAG.getUNDEF(VT); 4651 } 4652 4653 // Next, we iteratively mix elements, e.g. for v4f32: 4654 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 4655 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 4656 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 4657 unsigned EltStride = NumElems >> 1; 4658 while (EltStride != 0) { 4659 for (unsigned i = 0; i < EltStride; ++i) { 4660 // If V[i+EltStride] is undef and this is the first round of mixing, 4661 // then it is safe to just drop this shuffle: V[i] is already in the 4662 // right place, the one element (since it's the first round) being 4663 // inserted as undef can be dropped. This isn't safe for successive 4664 // rounds because they will permute elements within both vectors. 
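// Illustrative example: for <a, b, c, undef>, round 1 (EltStride == 2) forms // V[0] = unpcklps(V[0], V[2]) == <a, c, ?, ?> and skips V[1] because V[3] is // undef; round 2 then forms unpcklps(V[0], V[1]) == <a, b, c, ?> as desired.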
4665 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 4666 EltStride == NumElems/2) 4667 continue; 4668 4669 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 4670 } 4671 EltStride >>= 1; 4672 } 4673 return V[0]; 4674 } 4675 return SDValue(); 4676} 4677 4678SDValue 4679X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 4680 // We support concatenating two MMX registers and placing them in an MMX 4681 // register. This is better than doing a stack convert. 4682 DebugLoc dl = Op.getDebugLoc(); 4683 EVT ResVT = Op.getValueType(); 4684 assert(Op.getNumOperands() == 2); 4685 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 4686 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 4687 int Mask[2]; 4688 SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0)); 4689 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4690 InVec = Op.getOperand(1); 4691 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 4692 unsigned NumElts = ResVT.getVectorNumElements(); 4693 VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 4694 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 4695 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 4696 } else { 4697 InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec); 4698 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4699 Mask[0] = 0; Mask[1] = 2; 4700 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 4701 } 4702 return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 4703} 4704 4705// v8i16 shuffles - Prefer shuffles in the following order: 4706// 1. [all] pshuflw, pshufhw, optional move 4707// 2. [ssse3] 1 x pshufb 4708// 3. [ssse3] 2 x pshufb + 1 x por 4709// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 4710SDValue 4711X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 4712 SelectionDAG &DAG) const { 4713 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4714 SDValue V1 = SVOp->getOperand(0); 4715 SDValue V2 = SVOp->getOperand(1); 4716 DebugLoc dl = SVOp->getDebugLoc(); 4717 SmallVector<int, 8> MaskVals; 4718 4719 // Determine if more than 1 of the words in each of the low and high quadwords 4720 // of the result come from the same quadword of one of the two inputs. Undef 4721 // mask values count as coming from any quadword, for better codegen. 4722 SmallVector<unsigned, 4> LoQuad(4); 4723 SmallVector<unsigned, 4> HiQuad(4); 4724 BitVector InputQuads(4); 4725 for (unsigned i = 0; i < 8; ++i) { 4726 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 4727 int EltIdx = SVOp->getMaskElt(i); 4728 MaskVals.push_back(EltIdx); 4729 if (EltIdx < 0) { 4730 ++Quad[0]; 4731 ++Quad[1]; 4732 ++Quad[2]; 4733 ++Quad[3]; 4734 continue; 4735 } 4736 ++Quad[EltIdx / 4]; 4737 InputQuads.set(EltIdx / 4); 4738 } 4739 4740 int BestLoQuad = -1; 4741 unsigned MaxQuad = 1; 4742 for (unsigned i = 0; i < 4; ++i) { 4743 if (LoQuad[i] > MaxQuad) { 4744 BestLoQuad = i; 4745 MaxQuad = LoQuad[i]; 4746 } 4747 } 4748 4749 int BestHiQuad = -1; 4750 MaxQuad = 1; 4751 for (unsigned i = 0; i < 4; ++i) { 4752 if (HiQuad[i] > MaxQuad) { 4753 BestHiQuad = i; 4754 MaxQuad = HiQuad[i]; 4755 } 4756 } 4757 4758 // For SSSE3, if all 8 words of the result come from only 1 quadword of each 4759 // of the two input vectors, shuffle them into one input vector so only a 4760 // single pshufb instruction is necessary. If there are more than 2 input 4761 // quads, disable the next transformation since it does not help SSSE3.
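// For example (assuming SSSE3), the mask <0,1,2,3,12,13,14,15> touches only // quad 0 of V1 and quad 3 of V2, so BestLoQuad becomes 0 and BestHiQuad 3: the // two quads are packed into one register first and a single pshufb (or a // cheaper pshuflw/pshufhw) finishes the job.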
4762 bool V1Used = InputQuads[0] || InputQuads[1]; 4763 bool V2Used = InputQuads[2] || InputQuads[3]; 4764 if (Subtarget->hasSSSE3()) { 4765 if (InputQuads.count() == 2 && V1Used && V2Used) { 4766 BestLoQuad = InputQuads.find_first(); 4767 BestHiQuad = InputQuads.find_next(BestLoQuad); 4768 } 4769 if (InputQuads.count() > 2) { 4770 BestLoQuad = -1; 4771 BestHiQuad = -1; 4772 } 4773 } 4774 4775 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 4776 // the shuffle mask. If a quad is scored as -1, that means that it contains 4777 // words from all 4 input quadwords. 4778 SDValue NewV; 4779 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4780 SmallVector<int, 8> MaskV; 4781 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4782 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4783 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4784 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 4785 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 4786 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 4787 4788 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4789 // source words for the shuffle, to aid later transformations. 4790 bool AllWordsInNewV = true; 4791 bool InOrder[2] = { true, true }; 4792 for (unsigned i = 0; i != 8; ++i) { 4793 int idx = MaskVals[i]; 4794 if (idx != (int)i) 4795 InOrder[i/4] = false; 4796 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4797 continue; 4798 AllWordsInNewV = false; 4799 break; 4800 } 4801 4802 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 4803 if (AllWordsInNewV) { 4804 for (int i = 0; i != 8; ++i) { 4805 int idx = MaskVals[i]; 4806 if (idx < 0) 4807 continue; 4808 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 4809 if ((idx != i) && idx < 4) 4810 pshufhw = false; 4811 if ((idx != i) && idx > 3) 4812 pshuflw = false; 4813 } 4814 V1 = NewV; 4815 V2Used = false; 4816 BestLoQuad = 0; 4817 BestHiQuad = 1; 4818 } 4819 4820 // If we've eliminated the use of V2, and the new mask is a pshuflw or 4821 // pshufhw, that's as cheap as it gets. Return the new shuffle. 4822 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 4823 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 4824 unsigned TargetMask = 0; 4825 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4826 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4827 TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): 4828 X86::getShufflePSHUFLWImmediate(NewV.getNode()); 4829 V1 = NewV.getOperand(0); 4830 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 4831 } 4832 } 4833 4834 // If we have SSSE3, and all words of the result are from 1 input vector, 4835 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4836 // is present, fall back to case 4. 4837 if (Subtarget->hasSSSE3()) { 4838 SmallVector<SDValue,16> pshufbMask; 4839 4840 // If we have elements from both input vectors, set the high bit of the 4841 // shuffle mask element to zero out elements that come from V2 in the V1 4842 // mask, and elements that come from V1 in the V2 mask, so that the two 4843 // results can be OR'd together. 
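// For example, if word i of the result comes from V2 (MaskVals[i] >= 8), the // V1 pshufb mask holds 0x80,0x80 for that word's two bytes (yielding zeros) // while the V2 mask holds the real byte indices, so OR'ing the two shuffled // results merges them.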
4844 bool TwoInputs = V1Used && V2Used; 4845 for (unsigned i = 0; i != 8; ++i) { 4846 int EltIdx = MaskVals[i] * 2; 4847 if (TwoInputs && (EltIdx >= 16)) { 4848 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4849 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4850 continue; 4851 } 4852 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4853 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4854 } 4855 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); 4856 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4857 DAG.getNode(ISD::BUILD_VECTOR, dl, 4858 MVT::v16i8, &pshufbMask[0], 16)); 4859 if (!TwoInputs) 4860 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 4861 4862 // Calculate the shuffle mask for the second input, shuffle it, and 4863 // OR it with the first shuffled input. 4864 pshufbMask.clear(); 4865 for (unsigned i = 0; i != 8; ++i) { 4866 int EltIdx = MaskVals[i] * 2; 4867 if (EltIdx < 16) { 4868 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4869 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4870 continue; 4871 } 4872 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4873 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4874 } 4875 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); 4876 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4877 DAG.getNode(ISD::BUILD_VECTOR, dl, 4878 MVT::v16i8, &pshufbMask[0], 16)); 4879 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4880 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 4881 } 4882 4883 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4884 // and update MaskVals with new element order. 4885 BitVector InOrder(8); 4886 if (BestLoQuad >= 0) { 4887 SmallVector<int, 8> MaskV; 4888 for (int i = 0; i != 4; ++i) { 4889 int idx = MaskVals[i]; 4890 if (idx < 0) { 4891 MaskV.push_back(-1); 4892 InOrder.set(i); 4893 } else if ((idx / 4) == BestLoQuad) { 4894 MaskV.push_back(idx & 3); 4895 InOrder.set(i); 4896 } else { 4897 MaskV.push_back(-1); 4898 } 4899 } 4900 for (unsigned i = 4; i != 8; ++i) 4901 MaskV.push_back(i); 4902 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4903 &MaskV[0]); 4904 4905 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4906 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 4907 NewV.getOperand(0), 4908 X86::getShufflePSHUFLWImmediate(NewV.getNode()), 4909 DAG); 4910 } 4911 4912 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4913 // and update MaskVals with the new element order. 
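// For example, if BestHiQuad == 2 and MaskVals[4..7] == <10,9,8,11>, the mask // built here is <0,1,2,3,6,5,4,7>: the low words pass through untouched and a // single pshufhw permutes the high quadword of NewV into place.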
4914 if (BestHiQuad >= 0) { 4915 SmallVector<int, 8> MaskV; 4916 for (unsigned i = 0; i != 4; ++i) 4917 MaskV.push_back(i); 4918 for (unsigned i = 4; i != 8; ++i) { 4919 int idx = MaskVals[i]; 4920 if (idx < 0) { 4921 MaskV.push_back(-1); 4922 InOrder.set(i); 4923 } else if ((idx / 4) == BestHiQuad) { 4924 MaskV.push_back((idx & 3) + 4); 4925 InOrder.set(i); 4926 } else { 4927 MaskV.push_back(-1); 4928 } 4929 } 4930 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4931 &MaskV[0]); 4932 4933 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4934 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 4935 NewV.getOperand(0), 4936 X86::getShufflePSHUFHWImmediate(NewV.getNode()), 4937 DAG); 4938 } 4939 4940 // In case BestHi & BestLo were both -1, which means each quadword has a word 4941 // from each of the four input quadwords, calculate the InOrder bitvector now 4942 // before falling through to the insert/extract cleanup. 4943 if (BestLoQuad == -1 && BestHiQuad == -1) { 4944 NewV = V1; 4945 for (int i = 0; i != 8; ++i) 4946 if (MaskVals[i] < 0 || MaskVals[i] == i) 4947 InOrder.set(i); 4948 } 4949 4950 // The other elements are put in the right place using pextrw and pinsrw. 4951 for (unsigned i = 0; i != 8; ++i) { 4952 if (InOrder[i]) 4953 continue; 4954 int EltIdx = MaskVals[i]; 4955 if (EltIdx < 0) 4956 continue; 4957 SDValue ExtOp = (EltIdx < 8) 4958 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4959 DAG.getIntPtrConstant(EltIdx)) 4960 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4961 DAG.getIntPtrConstant(EltIdx - 8)); 4962 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4963 DAG.getIntPtrConstant(i)); 4964 } 4965 return NewV; 4966} 4967 4968// v16i8 shuffles - Prefer shuffles in the following order: 4969// 1. [ssse3] 1 x pshufb 4970// 2. [ssse3] 2 x pshufb + 1 x por 4971// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4972static 4973SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4974 SelectionDAG &DAG, 4975 const X86TargetLowering &TLI) { 4976 SDValue V1 = SVOp->getOperand(0); 4977 SDValue V2 = SVOp->getOperand(1); 4978 DebugLoc dl = SVOp->getDebugLoc(); 4979 SmallVector<int, 16> MaskVals; 4980 SVOp->getMask(MaskVals); 4981 4982 // If we have SSSE3, case 1 is generated when all result bytes come from 4983 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 4984 // present, fall back to case 3. 4985 // FIXME: kill V2Only once shuffles are canonicalized by getNode. 4986 bool V1Only = true; 4987 bool V2Only = true; 4988 for (unsigned i = 0; i < 16; ++i) { 4989 int EltIdx = MaskVals[i]; 4990 if (EltIdx < 0) 4991 continue; 4992 if (EltIdx < 16) 4993 V2Only = false; 4994 else 4995 V1Only = false; 4996 } 4997 4998 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 4999 if (TLI.getSubtarget()->hasSSSE3()) { 5000 SmallVector<SDValue,16> pshufbMask; 5001 5002 // If all result elements are from one input vector, then only translate 5003 // undef mask values to 0x80 (zero out result) in the pshufb mask. 5004 // 5005 // Otherwise, we have elements from both input vectors, and must zero out 5006 // elements that come from V2 in the first mask, and V1 in the second mask 5007 // so that we can OR them together.
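// For example, mask element 20 selects byte 4 of V2: the V1 pshufb mask gets // 0x80 (force zero) at that position while the V2 mask gets 20 - 16 == 4, and // the final OR merges the two shuffled halves.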
5008 bool TwoInputs = !(V1Only || V2Only); 5009 for (unsigned i = 0; i != 16; ++i) { 5010 int EltIdx = MaskVals[i]; 5011 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 5012 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5013 continue; 5014 } 5015 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5016 } 5017 // If all the elements are from V2, assign it to V1 and return after 5018 // building the first pshufb. 5019 if (V2Only) 5020 V1 = V2; 5021 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5022 DAG.getNode(ISD::BUILD_VECTOR, dl, 5023 MVT::v16i8, &pshufbMask[0], 16)); 5024 if (!TwoInputs) 5025 return V1; 5026 5027 // Calculate the shuffle mask for the second input, shuffle it, and 5028 // OR it with the first shuffled input. 5029 pshufbMask.clear(); 5030 for (unsigned i = 0; i != 16; ++i) { 5031 int EltIdx = MaskVals[i]; 5032 if (EltIdx < 16) { 5033 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5034 continue; 5035 } 5036 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 5037 } 5038 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5039 DAG.getNode(ISD::BUILD_VECTOR, dl, 5040 MVT::v16i8, &pshufbMask[0], 16)); 5041 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5042 } 5043 5044 // No SSSE3 - Calculate in-place words and then fix all out-of-place words 5045 // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from 5046 // the 16 different words that comprise the two doublequadword input vectors. 5047 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5048 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); 5049 SDValue NewV = V2Only ? V2 : V1; 5050 for (int i = 0; i != 8; ++i) { 5051 int Elt0 = MaskVals[i*2]; 5052 int Elt1 = MaskVals[i*2+1]; 5053 5054 // This word of the result is all undef, skip it. 5055 if (Elt0 < 0 && Elt1 < 0) 5056 continue; 5057 5058 // This word of the result is already in the correct place, skip it. 5059 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 5060 continue; 5061 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 5062 continue; 5063 5064 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 5065 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 5066 SDValue InsElt; 5067 5068 // If Elt0 and Elt1 are defined, are consecutive, and can be loaded 5069 // together using a single extract, load it and store it. 5070 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 5071 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5072 DAG.getIntPtrConstant(Elt1 / 2)); 5073 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5074 DAG.getIntPtrConstant(i)); 5075 continue; 5076 } 5077 5078 // If Elt1 is defined, extract it from the appropriate source. If the 5079 // source byte is not also odd, shift the extracted word left 8 bits; 5080 // otherwise clear the bottom 8 bits if we need to do an OR. 5081 if (Elt1 >= 0) { 5082 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5083 DAG.getIntPtrConstant(Elt1 / 2)); 5084 if ((Elt1 & 1) == 0) 5085 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 5086 DAG.getConstant(8, 5087 TLI.getShiftAmountTy(InsElt.getValueType()))); 5088 else if (Elt0 >= 0) 5089 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 5090 DAG.getConstant(0xFF00, MVT::i16)); 5091 } 5092 // If Elt0 is defined, extract it from the appropriate source. If the 5093 // source byte is not also even, shift the extracted word right 8 bits. If 5094 // Elt1 was also defined, OR the extracted values together before 5095 // inserting them in the result.
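// For example, if Elt0 == 5 (an odd source byte), word 5/2 == 2 is extracted // and shifted right by 8 so byte 5 lands in bits [7:0] of the word; if Elt1 // contributed bits [15:8], the two halves are OR'd before the single pinsrw.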
5096 if (Elt0 >= 0) { 5097 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 5098 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 5099 if ((Elt0 & 1) != 0) 5100 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 5101 DAG.getConstant(8, 5102 TLI.getShiftAmountTy(InsElt0.getValueType()))); 5103 else if (Elt1 >= 0) 5104 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 5105 DAG.getConstant(0x00FF, MVT::i16)); 5106 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 5107 : InsElt0; 5108 } 5109 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5110 DAG.getIntPtrConstant(i)); 5111 } 5112 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); 5113} 5114 5115/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 5116/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 5117/// done when every pair / quad of shuffle mask elements point to elements in 5118/// the right sequence. e.g. 5119/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> 5120static 5121SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 5122 SelectionDAG &DAG, DebugLoc dl) { 5123 EVT VT = SVOp->getValueType(0); 5124 SDValue V1 = SVOp->getOperand(0); 5125 SDValue V2 = SVOp->getOperand(1); 5126 unsigned NumElems = VT.getVectorNumElements(); 5127 unsigned NewWidth = (NumElems == 4) ? 2 : 4; 5128 EVT NewVT; 5129 switch (VT.getSimpleVT().SimpleTy) { 5130 default: assert(false && "Unexpected!"); 5131 case MVT::v4f32: NewVT = MVT::v2f64; break; 5132 case MVT::v4i32: NewVT = MVT::v2i64; break; 5133 case MVT::v8i16: NewVT = MVT::v4i32; break; 5134 case MVT::v16i8: NewVT = MVT::v4i32; break; 5135 } 5136 5137 int Scale = NumElems / NewWidth; 5138 SmallVector<int, 8> MaskVec; 5139 for (unsigned i = 0; i < NumElems; i += Scale) { 5140 int StartIdx = -1; 5141 for (int j = 0; j < Scale; ++j) { 5142 int EltIdx = SVOp->getMaskElt(i+j); 5143 if (EltIdx < 0) 5144 continue; 5145 if (StartIdx == -1) 5146 StartIdx = EltIdx - (EltIdx % Scale); 5147 if (EltIdx != StartIdx + j) 5148 return SDValue(); 5149 } 5150 if (StartIdx == -1) 5151 MaskVec.push_back(-1); 5152 else 5153 MaskVec.push_back(StartIdx / Scale); 5154 } 5155 5156 V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); 5157 V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); 5158 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 5159} 5160 5161/// getVZextMovL - Return a zero-extending vector move low node. 5162/// 5163static SDValue getVZextMovL(EVT VT, EVT OpVT, 5164 SDValue SrcOp, SelectionDAG &DAG, 5165 const X86Subtarget *Subtarget, DebugLoc dl) { 5166 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 5167 LoadSDNode *LD = NULL; 5168 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 5169 LD = dyn_cast<LoadSDNode>(SrcOp); 5170 if (!LD) { 5171 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 5172 // instead. 5173 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 5174 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && 5175 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 5176 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && 5177 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 5178 // PR2108 5179 OpVT = (OpVT == MVT::v2f64) ? 
MVT::v2i64 : MVT::v4i32; 5180 return DAG.getNode(ISD::BITCAST, dl, VT, 5181 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 5182 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5183 OpVT, 5184 SrcOp.getOperand(0) 5185 .getOperand(0)))); 5186 } 5187 } 5188 } 5189 5190 return DAG.getNode(ISD::BITCAST, dl, VT, 5191 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 5192 DAG.getNode(ISD::BITCAST, dl, 5193 OpVT, SrcOp))); 5194} 5195 5196/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 5197/// shuffles. 5198static SDValue 5199LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 5200 SDValue V1 = SVOp->getOperand(0); 5201 SDValue V2 = SVOp->getOperand(1); 5202 DebugLoc dl = SVOp->getDebugLoc(); 5203 EVT VT = SVOp->getValueType(0); 5204 5205 SmallVector<std::pair<int, int>, 8> Locs; 5206 Locs.resize(4); 5207 SmallVector<int, 8> Mask1(4U, -1); 5208 SmallVector<int, 8> PermMask; 5209 SVOp->getMask(PermMask); 5210 5211 unsigned NumHi = 0; 5212 unsigned NumLo = 0; 5213 for (unsigned i = 0; i != 4; ++i) { 5214 int Idx = PermMask[i]; 5215 if (Idx < 0) { 5216 Locs[i] = std::make_pair(-1, -1); 5217 } else { 5218 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 5219 if (Idx < 4) { 5220 Locs[i] = std::make_pair(0, NumLo); 5221 Mask1[NumLo] = Idx; 5222 NumLo++; 5223 } else { 5224 Locs[i] = std::make_pair(1, NumHi); 5225 if (2+NumHi < 4) 5226 Mask1[2+NumHi] = Idx; 5227 NumHi++; 5228 } 5229 } 5230 } 5231 5232 if (NumLo <= 2 && NumHi <= 2) { 5233 // No more than two elements come from either vector. This can be 5234 // implemented with two shuffles. The first shuffle gathers the elements. 5235 // The second shuffle, which takes the first shuffle as both of its 5236 // vector operands, puts the elements into the right order. 5237 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5238 5239 SmallVector<int, 8> Mask2(4U, -1); 5240 5241 for (unsigned i = 0; i != 4; ++i) { 5242 if (Locs[i].first == -1) 5243 continue; 5244 else { 5245 unsigned Idx = (i < 2) ? 0 : 4; 5246 Idx += Locs[i].first * 2 + Locs[i].second; 5247 Mask2[i] = Idx; 5248 } 5249 } 5250 5251 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 5252 } else if (NumLo == 3 || NumHi == 3) { 5253 // Otherwise, we must have three elements from one vector, call it X, and 5254 // one element from the other, call it Y. First, use a shufps to build an 5255 // intermediate vector with the one element from Y and the element from X 5256 // that will be in the same half in the final destination (the indexes don't 5257 // matter). Then, use a shufps to build the final vector, taking the half 5258 // containing the element from Y from the intermediate, and the other half 5259 // from X. 5260 if (NumHi == 3) { 5261 // Normalize it so the 3 elements come from V1. 5262 CommuteVectorShuffleMask(PermMask, VT); 5263 std::swap(V1, V2); 5264 } 5265 5266 // Find the element from V2. 5267 unsigned HiIndex; 5268 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 5269 int Val = PermMask[HiIndex]; 5270 if (Val < 0) 5271 continue; 5272 if (Val >= 4) 5273 break; 5274 } 5275 5276 Mask1[0] = PermMask[HiIndex]; 5277 Mask1[1] = -1; 5278 Mask1[2] = PermMask[HiIndex^1]; 5279 Mask1[3] = -1; 5280 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5281 5282 if (HiIndex >= 2) { 5283 Mask1[0] = PermMask[0]; 5284 Mask1[1] = PermMask[1]; 5285 Mask1[2] = HiIndex & 1 ? 6 : 4; 5286 Mask1[3] = HiIndex & 1 ? 4 : 6; 5287 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5288 } else { 5289 Mask1[0] = HiIndex & 1 ? 2 : 0; 5290 Mask1[1] = HiIndex & 1 ?
0 : 2; 5291 Mask1[2] = PermMask[2]; 5292 Mask1[3] = PermMask[3]; 5293 if (Mask1[2] >= 0) 5294 Mask1[2] += 4; 5295 if (Mask1[3] >= 0) 5296 Mask1[3] += 4; 5297 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 5298 } 5299 } 5300 5301 // Break it into (shuffle shuffle_hi, shuffle_lo). 5302 Locs.clear(); 5303 Locs.resize(4); 5304 SmallVector<int,8> LoMask(4U, -1); 5305 SmallVector<int,8> HiMask(4U, -1); 5306 5307 SmallVector<int,8> *MaskPtr = &LoMask; 5308 unsigned MaskIdx = 0; 5309 unsigned LoIdx = 0; 5310 unsigned HiIdx = 2; 5311 for (unsigned i = 0; i != 4; ++i) { 5312 if (i == 2) { 5313 MaskPtr = &HiMask; 5314 MaskIdx = 1; 5315 LoIdx = 0; 5316 HiIdx = 2; 5317 } 5318 int Idx = PermMask[i]; 5319 if (Idx < 0) { 5320 Locs[i] = std::make_pair(-1, -1); 5321 } else if (Idx < 4) { 5322 Locs[i] = std::make_pair(MaskIdx, LoIdx); 5323 (*MaskPtr)[LoIdx] = Idx; 5324 LoIdx++; 5325 } else { 5326 Locs[i] = std::make_pair(MaskIdx, HiIdx); 5327 (*MaskPtr)[HiIdx] = Idx; 5328 HiIdx++; 5329 } 5330 } 5331 5332 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 5333 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 5334 SmallVector<int, 8> MaskOps; 5335 for (unsigned i = 0; i != 4; ++i) { 5336 if (Locs[i].first == -1) { 5337 MaskOps.push_back(-1); 5338 } else { 5339 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 5340 MaskOps.push_back(Idx); 5341 } 5342 } 5343 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 5344} 5345 5346static bool MayFoldVectorLoad(SDValue V) { 5347 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 5348 V = V.getOperand(0); 5349 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5350 V = V.getOperand(0); 5351 if (MayFoldLoad(V)) 5352 return true; 5353 return false; 5354} 5355 5356// FIXME: the version above should always be used. Since there's 5357// a bug where several vector shuffles can't be folded because the 5358// DAG is not updated during lowering and a node claims to have two 5359// uses while it only has one, use this version, and let isel match 5360// another instruction if the load really happens to have more than 5361// one use. Remove this version after this bug gets fixed. 5362// rdar://8434668, PR8156 5363static bool RelaxedMayFoldVectorLoad(SDValue V) { 5364 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 5365 V = V.getOperand(0); 5366 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5367 V = V.getOperand(0); 5368 if (ISD::isNormalLoad(V.getNode())) 5369 return true; 5370 return false; 5371} 5372 5373/// CanXFormVExtractWithShuffleIntoLoad - Check if the current shuffle is used by 5374/// a vector extract, and if both can be later optimized into a single load. 5375/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked 5376/// here because otherwise a target specific shuffle node is going to be 5377/// emitted for this shuffle, and the optimization would not be done. 5378/// FIXME: This is probably not the best approach, but it fixes the problem 5379/// until the right path is decided.
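/// A minimal sketch of the pattern being guarded (assuming a constant index): /// (extract_elt (v4f32 shuffle (load $addr), undef, <1,u,u,u>), 0) /// -> (f32 load $addr+4) /// i.e. the shuffle and the extract collapse into one scalar load.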
5380static 5381bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, 5382 const TargetLowering &TLI) { 5383 EVT VT = V.getValueType(); 5384 ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); 5385 5386 // Be sure that the vector shuffle is present in a pattern like this: 5387 // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) 5388 if (!V.hasOneUse()) 5389 return false; 5390 5391 SDNode *N = *V.getNode()->use_begin(); 5392 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 5393 return false; 5394 5395 SDValue EltNo = N->getOperand(1); 5396 if (!isa<ConstantSDNode>(EltNo)) 5397 return false; 5398 5399 // If the bit convert changed the number of elements, it is unsafe 5400 // to examine the mask. 5401 bool HasShuffleIntoBitcast = false; 5402 if (V.getOpcode() == ISD::BITCAST) { 5403 EVT SrcVT = V.getOperand(0).getValueType(); 5404 if (SrcVT.getVectorNumElements() != VT.getVectorNumElements()) 5405 return false; 5406 V = V.getOperand(0); 5407 HasShuffleIntoBitcast = true; 5408 } 5409 5410 // Select the input vector, guarding against an out of range extract index. 5411 unsigned NumElems = VT.getVectorNumElements(); 5412 unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 5413 int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt); 5414 V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); 5415 5416 // Skip one more bit_convert if necessary. 5417 if (V.getOpcode() == ISD::BITCAST) 5418 V = V.getOperand(0); 5419 5420 if (ISD::isNormalLoad(V.getNode())) { 5421 // Is the original load suitable? 5422 LoadSDNode *LN0 = cast<LoadSDNode>(V); 5423 5424 // FIXME: avoid the multi-use bug that is preventing lots of 5425 // foldings from being detected; this is still wrong of course, but 5426 // it gives the temporarily desired behavior, and if it happens that 5427 // the load really has more uses, during isel it will not fold and 5428 // will generate poor code. 5429 if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse() 5430 return false; 5431 5432 if (!HasShuffleIntoBitcast) 5433 return true; 5434 5435 // If there's a bitcast before the shuffle, check if the load type and 5436 // alignment are valid. 5437 unsigned Align = LN0->getAlignment(); 5438 unsigned NewAlign = 5439 TLI.getTargetData()->getABITypeAlignment( 5440 VT.getTypeForEVT(*DAG.getContext())); 5441 5442 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) 5443 return false; 5444 } 5445 5446 return true; 5447} 5448 5449static 5450SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { 5451 EVT VT = Op.getValueType(); 5452 5453 // Canonicalize to v2f64.
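// movddup is only defined on v2f64, so e.g. a v2i64 input is lowered as // (v2i64 bitcast (v2f64 movddup (v2f64 bitcast V1))), duplicating the low // 64 bits into both lanes.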
5454 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); 5455 return DAG.getNode(ISD::BITCAST, dl, VT, 5456 getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, 5457 V1, DAG)); 5458} 5459 5460static 5461SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, 5462 bool HasSSE2) { 5463 SDValue V1 = Op.getOperand(0); 5464 SDValue V2 = Op.getOperand(1); 5465 EVT VT = Op.getValueType(); 5466 5467 assert(VT != MVT::v2i64 && "unsupported shuffle type"); 5468 5469 if (HasSSE2 && VT == MVT::v2f64) 5470 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); 5471 5472 // v4f32 or v4i32 5473 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG); 5474} 5475 5476static 5477SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { 5478 SDValue V1 = Op.getOperand(0); 5479 SDValue V2 = Op.getOperand(1); 5480 EVT VT = Op.getValueType(); 5481 5482 assert((VT == MVT::v4i32 || VT == MVT::v4f32) && 5483 "unsupported shuffle type"); 5484 5485 if (V2.getOpcode() == ISD::UNDEF) 5486 V2 = V1; 5487 5488 // v4i32 or v4f32 5489 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); 5490} 5491 5492static 5493SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { 5494 SDValue V1 = Op.getOperand(0); 5495 SDValue V2 = Op.getOperand(1); 5496 EVT VT = Op.getValueType(); 5497 unsigned NumElems = VT.getVectorNumElements(); 5498 5499 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second 5500 // operand of these instructions is only memory, so check if there's a 5501 // potential load folding here, otherwise use SHUFPS or MOVSD to match the 5502 // same masks. 5503 bool CanFoldLoad = false; 5504 5505 // Trivial case, when V2 comes from a load. 5506 if (MayFoldVectorLoad(V2)) 5507 CanFoldLoad = true; 5508 5509 // When V1 is a load, it can be folded later into a store in isel, example: 5510 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) 5511 // turns into: 5512 // (MOVLPSmr addr:$src1, VR128:$src2) 5513 // So, recognize this potential and also use MOVLPS or MOVLPD 5514 if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) 5515 CanFoldLoad = true; 5516 5517 // They can't both be memory operations, though. 5518 if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2)) 5519 CanFoldLoad = false; 5520 5521 if (CanFoldLoad) { 5522 if (HasSSE2 && NumElems == 2) 5523 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); 5524 5525 if (NumElems == 4) 5526 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); 5527 } 5528 5529 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5530 // movl and movlp will both match v2i64, but v2i64 is never matched by 5531 // movl earlier because we make it strict to avoid messing with the movlp load 5532 // folding logic (see the code above getMOVLP call). Match it here then; 5533 // this is horrible, but it will stay like this until we move all shuffle 5534 // matching to x86 specific nodes. Note that for the 1st condition all 5535 // types are matched with movsd. 5536 if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp)) 5537 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 5538 else if (HasSSE2) 5539 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 5540 5541 5542 assert(VT != MVT::v4i32 && "unsupported shuffle type"); 5543 5544 // Invert the operand order and use SHUFPS to match it.
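// e.g. the v4f32 movlp-style mask <4,5,2,3> with swapped operands becomes // shufps V2, V1 with the same immediate: the low two result words are read // from the first operand (now V2) and the high two from the second (now V1), // reproducing the original shuffle.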
5545 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1, 5546 X86::getShuffleSHUFImmediate(SVOp), DAG); 5547} 5548 5549static inline unsigned getUNPCKLOpcode(EVT VT, const X86Subtarget *Subtarget) { 5550 switch(VT.getSimpleVT().SimpleTy) { 5551 case MVT::v4i32: return X86ISD::PUNPCKLDQ; 5552 case MVT::v2i64: return X86ISD::PUNPCKLQDQ; 5553 case MVT::v4f32: 5554 return Subtarget->hasAVX() ? X86ISD::VUNPCKLPS : X86ISD::UNPCKLPS; 5555 case MVT::v2f64: 5556 return Subtarget->hasAVX() ? X86ISD::VUNPCKLPD : X86ISD::UNPCKLPD; 5557 case MVT::v8f32: return X86ISD::VUNPCKLPSY; 5558 case MVT::v4f64: return X86ISD::VUNPCKLPDY; 5559 case MVT::v16i8: return X86ISD::PUNPCKLBW; 5560 case MVT::v8i16: return X86ISD::PUNPCKLWD; 5561 default: 5562 llvm_unreachable("Unknown type for unpckl"); 5563 } 5564 return 0; 5565} 5566 5567static inline unsigned getUNPCKHOpcode(EVT VT) { 5568 switch(VT.getSimpleVT().SimpleTy) { 5569 case MVT::v4i32: return X86ISD::PUNPCKHDQ; 5570 case MVT::v2i64: return X86ISD::PUNPCKHQDQ; 5571 case MVT::v4f32: return X86ISD::UNPCKHPS; 5572 case MVT::v2f64: return X86ISD::UNPCKHPD; 5573 case MVT::v16i8: return X86ISD::PUNPCKHBW; 5574 case MVT::v8i16: return X86ISD::PUNPCKHWD; 5575 default: 5576 llvm_unreachable("Unknown type for unpckh"); 5577 } 5578 return 0; 5579} 5580 5581static 5582SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, 5583 const TargetLowering &TLI, 5584 const X86Subtarget *Subtarget) { 5585 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5586 EVT VT = Op.getValueType(); 5587 DebugLoc dl = Op.getDebugLoc(); 5588 SDValue V1 = Op.getOperand(0); 5589 SDValue V2 = Op.getOperand(1); 5590 5591 if (isZeroShuffle(SVOp)) 5592 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 5593 5594 // Handle splat operations 5595 if (SVOp->isSplat()) { 5596 // Special case, this is the only place now where it's 5597 // allowed to return a vector_shuffle operation without 5598 // using a target specific node, because *hopefully* it 5599 // will be optimized away by the dag combiner. 5600 if (VT.getVectorNumElements() <= 4 && 5601 CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI)) 5602 return Op; 5603 5604 // Handle splats by matching through known masks 5605 if (VT.getVectorNumElements() <= 4) 5606 return SDValue(); 5607 5608 // Canonicalize all of the remaining to v4f32. 5609 return PromoteSplat(SVOp, DAG); 5610 } 5611 5612 // If the shuffle can be profitably rewritten as a narrower shuffle, then 5613 // do it! 5614 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 5615 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5616 if (NewOp.getNode()) 5617 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); 5618 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 5619 // FIXME: Figure out a cleaner way to do this. 5620 // Try to make use of movq to zero out the top part. 
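// e.g. a v4i32 shuffle of (X, zeroinitializer) in which only the low i64 // half of X survives can be narrowed to v2i64 and emitted as // (vzext_movl X'), i.e. a single movq that implicitly clears the upper // quadword.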
5621 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 5622 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5623 if (NewOp.getNode()) { 5624 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 5625 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 5626 DAG, Subtarget, dl); 5627 } 5628 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 5629 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5630 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 5631 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 5632 DAG, Subtarget, dl); 5633 } 5634 } 5635 return SDValue(); 5636} 5637 5638SDValue 5639X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 5640 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5641 SDValue V1 = Op.getOperand(0); 5642 SDValue V2 = Op.getOperand(1); 5643 EVT VT = Op.getValueType(); 5644 DebugLoc dl = Op.getDebugLoc(); 5645 unsigned NumElems = VT.getVectorNumElements(); 5646 bool isMMX = VT.getSizeInBits() == 64; 5647 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 5648 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 5649 bool V1IsSplat = false; 5650 bool V2IsSplat = false; 5651 bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX(); 5652 bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX(); 5653 bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX(); 5654 MachineFunction &MF = DAG.getMachineFunction(); 5655 bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); 5656 5657 // Shuffle operations on MMX are not supported. 5658 if (isMMX) 5659 return Op; 5660 5661 // Vector shuffle lowering takes 3 steps: 5662 // 5663 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable 5664 // narrowing and commutation of operands should be handled. 5665 // 2) Matching of shuffles with known shuffle masks to x86 target specific 5666 // shuffle nodes. 5667 // 3) Rewriting of unmatched masks into new generic shuffle operations, 5668 // so the shuffle can be broken into other shuffles and the legalizer can 5669 // try the lowering again. 5670 // 5671 // The general idea is that no vector_shuffle operation should be left to 5672 // be matched during isel; all of them must be converted to a target specific 5673 // node here. 5674 5675 // Normalize the input vectors. Here splats, zeroed vectors, profitable 5676 // narrowing and commutation of operands should be handled. The actual code 5677 // doesn't include all of those, work in progress... 5678 SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget); 5679 if (NewOp.getNode()) 5680 return NewOp; 5681 5682 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and 5683 // unpckh_undef). Only use pshufd if speed is more important than size.
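// e.g. the v4i32 mask <0,0,1,1> is matched both by pshufd $0x50 and by // punpckldq with a repeated source; the unpack form needs no immediate byte, // so it is one byte shorter and preferred under OptForSize.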
5684 if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) 5685 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5686 return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), dl, VT, V1, V1, DAG); 5687 if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) 5688 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5689 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5690 5691 if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef && 5692 RelaxedMayFoldVectorLoad(V1)) 5693 return getMOVDDup(Op, dl, V1, DAG); 5694 5695 if (X86::isMOVHLPS_v_undef_Mask(SVOp)) 5696 return getMOVHighToLow(Op, dl, DAG); 5697 5698 // Used to match splats 5699 if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef && 5700 (VT == MVT::v2f64 || VT == MVT::v2i64)) 5701 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5702 5703 if (X86::isPSHUFDMask(SVOp)) { 5704 // The actual implementation will match the mask in the if above and then 5705 // during isel it can match several different instructions, not only pshufd 5706 // as its name says. Sad but true; emulate the behavior for now... 5707 if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 5708 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 5709 5710 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 5711 5712 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 5713 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 5714 5715 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 5716 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1, 5717 TargetMask, DAG); 5718 5719 if (VT == MVT::v4f32) 5720 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1, 5721 TargetMask, DAG); 5722 } 5723 5724 // Check if this can be converted into a logical shift. 5725 bool isLeft = false; 5726 unsigned ShAmt = 0; 5727 SDValue ShVal; 5728 bool isShift = getSubtarget()->hasSSE2() && 5729 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 5730 if (isShift && ShVal.hasOneUse()) { 5731 // If the shifted value has multiple uses, it may be cheaper to use 5732 // v_set0 + movlhps or movhlps, etc. 5733 EVT EltVT = VT.getVectorElementType(); 5734 ShAmt *= EltVT.getSizeInBits(); 5735 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 5736 } 5737 5738 if (X86::isMOVLMask(SVOp)) { 5739 if (V1IsUndef) 5740 return V2; 5741 if (ISD::isBuildVectorAllZeros(V1.getNode())) 5742 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 5743 if (!X86::isMOVLPMask(SVOp)) { 5744 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 5745 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 5746 5747 if (VT == MVT::v4i32 || VT == MVT::v4f32) 5748 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 5749 } 5750 } 5751 5752 // FIXME: fold these into legal mask.
5753 if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) 5754 return getMOVLowToHigh(Op, dl, DAG, HasSSE2); 5755 5756 if (X86::isMOVHLPSMask(SVOp)) 5757 return getMOVHighToLow(Op, dl, DAG); 5758 5759 if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) 5760 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 5761 5762 if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) 5763 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 5764 5765 if (X86::isMOVLPMask(SVOp)) 5766 return getMOVLP(Op, dl, DAG, HasSSE2); 5767 5768 if (ShouldXformToMOVHLPS(SVOp) || 5769 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 5770 return CommuteVectorShuffle(SVOp, DAG); 5771 5772 if (isShift) { 5773 // No better options. Use a vshl / vsrl. 5774 EVT EltVT = VT.getVectorElementType(); 5775 ShAmt *= EltVT.getSizeInBits(); 5776 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 5777 } 5778 5779 bool Commuted = false; 5780 // FIXME: This should also accept a bitcast of a splat? Be careful, not 5781 // 1,1,1,1 -> v8i16 though. 5782 V1IsSplat = isSplatVector(V1.getNode()); 5783 V2IsSplat = isSplatVector(V2.getNode()); 5784 5785 // Canonicalize the splat or undef, if present, to be on the RHS. 5786 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 5787 Op = CommuteVectorShuffle(SVOp, DAG); 5788 SVOp = cast<ShuffleVectorSDNode>(Op); 5789 V1 = SVOp->getOperand(0); 5790 V2 = SVOp->getOperand(1); 5791 std::swap(V1IsSplat, V2IsSplat); 5792 std::swap(V1IsUndef, V2IsUndef); 5793 Commuted = true; 5794 } 5795 5796 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 5797 // Shuffling low element of v1 into undef, just return v1. 5798 if (V2IsUndef) 5799 return V1; 5800 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 5801 // the instruction selector will not match, so get a canonical MOVL with 5802 // swapped operands to undo the commute. 5803 return getMOVL(DAG, dl, VT, V2, V1); 5804 } 5805 5806 if (X86::isUNPCKLMask(SVOp)) 5807 return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), 5808 dl, VT, V1, V2, DAG); 5809 5810 if (X86::isUNPCKHMask(SVOp)) 5811 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); 5812 5813 if (V2IsSplat) { 5814 // Normalize the mask so all entries that point to V2 point to its first 5815 // element, then try to match unpck{h|l} again. If they match, return a 5816 // new vector_shuffle with the corrected mask. 5817 SDValue NewMask = NormalizeMask(SVOp, DAG); 5818 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 5819 if (NSVOp != SVOp) { 5820 if (X86::isUNPCKLMask(NSVOp, true)) { 5821 return NewMask; 5822 } else if (X86::isUNPCKHMask(NSVOp, true)) { 5823 return NewMask; 5824 } 5825 } 5826 } 5827 5828 if (Commuted) { 5829 // Commute it back and try unpck* again. 5830 // FIXME: this seems wrong.
5831 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 5832 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 5833 5834 if (X86::isUNPCKLMask(NewSVOp)) 5835 return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), 5836 dl, VT, V2, V1, DAG); 5837 5838 if (X86::isUNPCKHMask(NewSVOp)) 5839 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); 5840 } 5841 5842 // Normalize the node to match x86 shuffle ops if needed 5843 if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 5844 return CommuteVectorShuffle(SVOp, DAG); 5845 5846 // The checks below are all present in isShuffleMaskLegal, but they are 5847 // inlined here right now to enable us to directly emit target specific 5848 // nodes, and remove one by one until they don't return Op anymore. 5849 SmallVector<int, 16> M; 5850 SVOp->getMask(M); 5851 5852 if (isPALIGNRMask(M, VT, HasSSSE3)) 5853 return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, 5854 X86::getShufflePALIGNRImmediate(SVOp), 5855 DAG); 5856 5857 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 5858 SVOp->getSplatIndex() == 0 && V2IsUndef) { 5859 if (VT == MVT::v2f64) { 5860 X86ISD::NodeType Opcode = 5861 getSubtarget()->hasAVX() ? X86ISD::VUNPCKLPD : X86ISD::UNPCKLPD; 5862 return getTargetShuffleNode(Opcode, dl, VT, V1, V1, DAG); 5863 } 5864 if (VT == MVT::v2i64) 5865 return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); 5866 } 5867 5868 if (isPSHUFHWMask(M, VT)) 5869 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 5870 X86::getShufflePSHUFHWImmediate(SVOp), 5871 DAG); 5872 5873 if (isPSHUFLWMask(M, VT)) 5874 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 5875 X86::getShufflePSHUFLWImmediate(SVOp), 5876 DAG); 5877 5878 if (isSHUFPMask(M, VT)) { 5879 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 5880 if (VT == MVT::v4f32 || VT == MVT::v4i32) 5881 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2, 5882 TargetMask, DAG); 5883 if (VT == MVT::v2f64 || VT == MVT::v2i64) 5884 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2, 5885 TargetMask, DAG); 5886 } 5887 5888 if (X86::isUNPCKL_v_undef_Mask(SVOp)) 5889 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5890 return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), 5891 dl, VT, V1, V1, DAG); 5892 if (X86::isUNPCKH_v_undef_Mask(SVOp)) 5893 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5894 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5895 5896 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 5897 if (VT == MVT::v8i16) { 5898 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 5899 if (NewOp.getNode()) 5900 return NewOp; 5901 } 5902 5903 if (VT == MVT::v16i8) { 5904 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 5905 if (NewOp.getNode()) 5906 return NewOp; 5907 } 5908 5909 // Handle all 4 wide cases with a number of shuffles. 
5910 if (NumElems == 4) 5911 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 5912 5913 return SDValue(); 5914} 5915 5916SDValue 5917X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 5918 SelectionDAG &DAG) const { 5919 EVT VT = Op.getValueType(); 5920 DebugLoc dl = Op.getDebugLoc(); 5921 if (VT.getSizeInBits() == 8) { 5922 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 5923 Op.getOperand(0), Op.getOperand(1)); 5924 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5925 DAG.getValueType(VT)); 5926 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5927 } else if (VT.getSizeInBits() == 16) { 5928 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5929 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 5930 if (Idx == 0) 5931 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5932 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5933 DAG.getNode(ISD::BITCAST, dl, 5934 MVT::v4i32, 5935 Op.getOperand(0)), 5936 Op.getOperand(1))); 5937 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 5938 Op.getOperand(0), Op.getOperand(1)); 5939 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5940 DAG.getValueType(VT)); 5941 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5942 } else if (VT == MVT::f32) { 5943 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 5944 // the result back to FR32 register. It's only worth matching if the 5945 // result has a single use which is a store or a bitcast to i32. And in 5946 // the case of a store, it's not worth it if the index is a constant 0, 5947 // because a MOVSSmr can be used instead, which is smaller and faster. 5948 if (!Op.hasOneUse()) 5949 return SDValue(); 5950 SDNode *User = *Op.getNode()->use_begin(); 5951 if ((User->getOpcode() != ISD::STORE || 5952 (isa<ConstantSDNode>(Op.getOperand(1)) && 5953 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 5954 (User->getOpcode() != ISD::BITCAST || 5955 User->getValueType(0) != MVT::i32)) 5956 return SDValue(); 5957 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5958 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 5959 Op.getOperand(0)), 5960 Op.getOperand(1)); 5961 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 5962 } else if (VT == MVT::i32) { 5963 // ExtractPS works with constant index. 5964 if (isa<ConstantSDNode>(Op.getOperand(1))) 5965 return Op; 5966 } 5967 return SDValue(); 5968} 5969 5970 5971SDValue 5972X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 5973 SelectionDAG &DAG) const { 5974 if (!isa<ConstantSDNode>(Op.getOperand(1))) 5975 return SDValue(); 5976 5977 SDValue Vec = Op.getOperand(0); 5978 EVT VecVT = Vec.getValueType(); 5979 5980 // If this is a 256-bit vector result, first extract the 128-bit 5981 // vector and then extract from the 128-bit vector. 5982 if (VecVT.getSizeInBits() > 128) { 5983 DebugLoc dl = Op.getNode()->getDebugLoc(); 5984 unsigned NumElems = VecVT.getVectorNumElements(); 5985 SDValue Idx = Op.getOperand(1); 5986 5987 if (!isa<ConstantSDNode>(Idx)) 5988 return SDValue(); 5989 5990 unsigned ExtractNumElems = NumElems / (VecVT.getSizeInBits() / 128); 5991 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 5992 5993 // Get the 128-bit vector. 5994 bool Upper = IdxVal >= ExtractNumElems; 5995 Vec = Extract128BitVector(Vec, Idx, DAG, dl); 5996 5997 // Extract from it. 
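// e.g. extracting element 5 of a v8f32: the upper v4f32 half is grabbed by // Extract128BitVector and the index is rebased to 5 - 4 == 1 before emitting // the narrower extract.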
5998 SDValue ScaledIdx = Idx; 5999 if (Upper) 6000 ScaledIdx = DAG.getNode(ISD::SUB, dl, Idx.getValueType(), Idx, 6001 DAG.getConstant(ExtractNumElems, 6002 Idx.getValueType())); 6003 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, 6004 ScaledIdx); 6005 } 6006 6007 assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); 6008 6009 if (Subtarget->hasSSE41()) { 6010 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 6011 if (Res.getNode()) 6012 return Res; 6013 } 6014 6015 EVT VT = Op.getValueType(); 6016 DebugLoc dl = Op.getDebugLoc(); 6017 // TODO: handle v16i8. 6018 if (VT.getSizeInBits() == 16) { 6019 SDValue Vec = Op.getOperand(0); 6020 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6021 if (Idx == 0) 6022 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6023 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6024 DAG.getNode(ISD::BITCAST, dl, 6025 MVT::v4i32, Vec), 6026 Op.getOperand(1))); 6027 // Transform it so it matches pextrw, which produces a 32-bit result. 6028 EVT EltVT = MVT::i32; 6029 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 6030 Op.getOperand(0), Op.getOperand(1)); 6031 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 6032 DAG.getValueType(VT)); 6033 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6034 } else if (VT.getSizeInBits() == 32) { 6035 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6036 if (Idx == 0) 6037 return Op; 6038 6039 // SHUFPS the element to the lowest double word, then movss. 6040 int Mask[4] = { Idx, -1, -1, -1 }; 6041 EVT VVT = Op.getOperand(0).getValueType(); 6042 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6043 DAG.getUNDEF(VVT), Mask); 6044 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6045 DAG.getIntPtrConstant(0)); 6046 } else if (VT.getSizeInBits() == 64) { 6047 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 6048 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 6049 // to match extract_elt for f64. 6050 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6051 if (Idx == 0) 6052 return Op; 6053 6054 // UNPCKHPD the element to the lowest double word, then movsd. 6055 // Note if the lower 64 bits of the result of the UNPCKHPD are then stored 6056 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 6057 int Mask[2] = { 1, -1 }; 6058 EVT VVT = Op.getOperand(0).getValueType(); 6059 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6060 DAG.getUNDEF(VVT), Mask); 6061 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6062 DAG.getIntPtrConstant(0)); 6063 } 6064 6065 return SDValue(); 6066} 6067 6068SDValue 6069X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 6070 SelectionDAG &DAG) const { 6071 EVT VT = Op.getValueType(); 6072 EVT EltVT = VT.getVectorElementType(); 6073 DebugLoc dl = Op.getDebugLoc(); 6074 6075 SDValue N0 = Op.getOperand(0); 6076 SDValue N1 = Op.getOperand(1); 6077 SDValue N2 = Op.getOperand(2); 6078 6079 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 6080 isa<ConstantSDNode>(N2)) { 6081 unsigned Opc; 6082 if (VT == MVT::v8i16) 6083 Opc = X86ISD::PINSRW; 6084 else if (VT == MVT::v16i8) 6085 Opc = X86ISD::PINSRB; 6086 else 6087 Opc = X86ISD::PINSRB; 6088 6089 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second 6090 // argument.
6091 if (N1.getValueType() != MVT::i32) 6092 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 6093 if (N2.getValueType() != MVT::i32) 6094 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 6095 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 6096 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 6097 // Bits [7:6] of the constant are the source select. This will always be 6098 // zero here. The DAG Combiner may combine an extract_elt index into these 6099 // bits. For example (insert (extract, 3), 2) could be matched by putting 6100 // the '3' into bits [7:6] of X86ISD::INSERTPS. 6101 // Bits [5:4] of the constant are the destination select. This is the 6102 // value of the incoming immediate. 6103 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 6104 // combine either bitwise AND or insert of float 0.0 to set these bits. 6105 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 6106 // Create this as a scalar to vector. 6107 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 6108 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 6109 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 6110 // PINSR* works with constant index. 6111 return Op; 6112 } 6113 return SDValue(); 6114} 6115 6116SDValue 6117X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 6118 EVT VT = Op.getValueType(); 6119 EVT EltVT = VT.getVectorElementType(); 6120 6121 DebugLoc dl = Op.getDebugLoc(); 6122 SDValue N0 = Op.getOperand(0); 6123 SDValue N1 = Op.getOperand(1); 6124 SDValue N2 = Op.getOperand(2); 6125 6126 // If this is a 256-bit vector result, first insert into a 128-bit 6127 // vector and then insert into the 256-bit vector. 6128 if (VT.getSizeInBits() > 128) { 6129 if (!isa<ConstantSDNode>(N2)) 6130 return SDValue(); 6131 6132 // Get the 128-bit vector. 6133 unsigned NumElems = VT.getVectorNumElements(); 6134 unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); 6135 bool Upper = IdxVal >= NumElems / 2; 6136 6137 SDValue SubN0 = Extract128BitVector(N0, N2, DAG, dl); 6138 6139 // Insert into it. 6140 SDValue ScaledN2 = N2; 6141 if (Upper) 6142 ScaledN2 = DAG.getNode(ISD::SUB, dl, N2.getValueType(), N2, 6143 DAG.getConstant(NumElems / 6144 (VT.getSizeInBits() / 128), 6145 N2.getValueType())); 6146 Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubN0.getValueType(), SubN0, 6147 N1, ScaledN2); 6148 6149 // Insert the 128-bit vector. 6150 // FIXME: Why UNDEF? 6151 return Insert128BitVector(N0, Op, N2, DAG, dl); 6152 } 6153 6154 if (Subtarget->hasSSE41()) 6155 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 6156 6157 if (EltVT == MVT::i8) 6158 return SDValue(); 6159 6160 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 6161 // Transform it so it matches pinsrw which expects a 16-bit value in a GR32 6162 // as its second argument. 6163 if (N1.getValueType() != MVT::i32) 6164 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 6165 if (N2.getValueType() != MVT::i32) 6166 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 6167 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 6168 } 6169 return SDValue(); 6170} 6171 6172SDValue 6173X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { 6174 LLVMContext *Context = DAG.getContext(); 6175 DebugLoc dl = Op.getDebugLoc(); 6176 EVT OpVT = Op.getValueType(); 6177 6178 // If this is a 256-bit vector result, first insert into a 128-bit 6179 // vector and then insert into the 256-bit vector.
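// For example, (v8f32 scalar_to_vector f32 %x) is built as a v4f32
// scalar_to_vector whose result is then placed into the low 128 bits of an
// undef 256-bit vector.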
6180 if (OpVT.getSizeInBits() > 128) { 6181 // Insert into a 128-bit vector. 6182 EVT VT128 = EVT::getVectorVT(*Context, 6183 OpVT.getVectorElementType(), 6184 OpVT.getVectorNumElements() / 2); 6185 6186 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); 6187 6188 // Insert the 128-bit vector. 6189 return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op, 6190 DAG.getConstant(0, MVT::i32), 6191 DAG, dl); 6192 } 6193 6194 if (Op.getValueType() == MVT::v1i64 && 6195 Op.getOperand(0).getValueType() == MVT::i64) 6196 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 6197 6198 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 6199 assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 && 6200 "Expected an SSE type!"); 6201 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), 6202 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt)); 6203} 6204 6205// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in 6206// a simple subregister reference or explicit instructions to grab 6207// the upper bits of a vector. 6208SDValue 6209X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 6210 if (Subtarget->hasAVX()) { 6211 DebugLoc dl = Op.getNode()->getDebugLoc(); 6212 SDValue Vec = Op.getNode()->getOperand(0); 6213 SDValue Idx = Op.getNode()->getOperand(1); 6214 6215 if (Op.getNode()->getValueType(0).getSizeInBits() == 128 6216 && Vec.getNode()->getValueType(0).getSizeInBits() == 256) { 6217 return Extract128BitVector(Vec, Idx, DAG, dl); 6218 } 6219 } 6220 return SDValue(); 6221} 6222 6223// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a 6224// simple superregister reference or explicit instructions to insert 6225// the upper bits of a vector. 6226SDValue 6227X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 6228 if (Subtarget->hasAVX()) { 6229 DebugLoc dl = Op.getNode()->getDebugLoc(); 6230 SDValue Vec = Op.getNode()->getOperand(0); 6231 SDValue SubVec = Op.getNode()->getOperand(1); 6232 SDValue Idx = Op.getNode()->getOperand(2); 6233 6234 if (Op.getNode()->getValueType(0).getSizeInBits() == 256 6235 && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) { 6236 return Insert128BitVector(Vec, SubVec, Idx, DAG, dl); 6237 } 6238 } 6239 return SDValue(); 6240} 6241 6242// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 6243// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is 6244// one of the above-mentioned nodes. It has to be wrapped because otherwise 6245// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 6246// be used to form addressing modes. These wrapped nodes will be selected 6247// into MOV32ri. 6248SDValue 6249X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 6250 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 6251 6252 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6253 // global base reg.
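// For example, 32-bit ELF PIC refers to the pool entry as <label>@GOTOFF plus
// the global base register, whereas x86-64 in the small and kernel code
// models uses a RIP-relative reference instead.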
6254 unsigned char OpFlag = 0; 6255 unsigned WrapperKind = X86ISD::Wrapper; 6256 CodeModel::Model M = getTargetMachine().getCodeModel(); 6257 6258 if (Subtarget->isPICStyleRIPRel() && 6259 (M == CodeModel::Small || M == CodeModel::Kernel)) 6260 WrapperKind = X86ISD::WrapperRIP; 6261 else if (Subtarget->isPICStyleGOT()) 6262 OpFlag = X86II::MO_GOTOFF; 6263 else if (Subtarget->isPICStyleStubPIC()) 6264 OpFlag = X86II::MO_PIC_BASE_OFFSET; 6265 6266 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 6267 CP->getAlignment(), 6268 CP->getOffset(), OpFlag); 6269 DebugLoc DL = CP->getDebugLoc(); 6270 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6271 // With PIC, the address is actually $g + Offset. 6272 if (OpFlag) { 6273 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6274 DAG.getNode(X86ISD::GlobalBaseReg, 6275 DebugLoc(), getPointerTy()), 6276 Result); 6277 } 6278 6279 return Result; 6280} 6281 6282SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 6283 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 6284 6285 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6286 // global base reg. 6287 unsigned char OpFlag = 0; 6288 unsigned WrapperKind = X86ISD::Wrapper; 6289 CodeModel::Model M = getTargetMachine().getCodeModel(); 6290 6291 if (Subtarget->isPICStyleRIPRel() && 6292 (M == CodeModel::Small || M == CodeModel::Kernel)) 6293 WrapperKind = X86ISD::WrapperRIP; 6294 else if (Subtarget->isPICStyleGOT()) 6295 OpFlag = X86II::MO_GOTOFF; 6296 else if (Subtarget->isPICStyleStubPIC()) 6297 OpFlag = X86II::MO_PIC_BASE_OFFSET; 6298 6299 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 6300 OpFlag); 6301 DebugLoc DL = JT->getDebugLoc(); 6302 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6303 6304 // With PIC, the address is actually $g + Offset. 6305 if (OpFlag) 6306 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6307 DAG.getNode(X86ISD::GlobalBaseReg, 6308 DebugLoc(), getPointerTy()), 6309 Result); 6310 6311 return Result; 6312} 6313 6314SDValue 6315X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 6316 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 6317 6318 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6319 // global base reg. 6320 unsigned char OpFlag = 0; 6321 unsigned WrapperKind = X86ISD::Wrapper; 6322 CodeModel::Model M = getTargetMachine().getCodeModel(); 6323 6324 if (Subtarget->isPICStyleRIPRel() && 6325 (M == CodeModel::Small || M == CodeModel::Kernel)) 6326 WrapperKind = X86ISD::WrapperRIP; 6327 else if (Subtarget->isPICStyleGOT()) 6328 OpFlag = X86II::MO_GOTOFF; 6329 else if (Subtarget->isPICStyleStubPIC()) 6330 OpFlag = X86II::MO_PIC_BASE_OFFSET; 6331 6332 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 6333 6334 DebugLoc DL = Op.getDebugLoc(); 6335 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6336 6337 6338 // With PIC, the address is actually $g + Offset. 6339 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 6340 !Subtarget->is64Bit()) { 6341 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6342 DAG.getNode(X86ISD::GlobalBaseReg, 6343 DebugLoc(), getPointerTy()), 6344 Result); 6345 } 6346 6347 return Result; 6348} 6349 6350SDValue 6351X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 6352 // Create the TargetBlockAddress node.
6353 unsigned char OpFlags = 6354 Subtarget->ClassifyBlockAddressReference(); 6355 CodeModel::Model M = getTargetMachine().getCodeModel(); 6356 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 6357 DebugLoc dl = Op.getDebugLoc(); 6358 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 6359 /*isTarget=*/true, OpFlags); 6360 6361 if (Subtarget->isPICStyleRIPRel() && 6362 (M == CodeModel::Small || M == CodeModel::Kernel)) 6363 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 6364 else 6365 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 6366 6367 // With PIC, the address is actually $g + Offset. 6368 if (isGlobalRelativeToPICBase(OpFlags)) { 6369 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6370 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 6371 Result); 6372 } 6373 6374 return Result; 6375} 6376 6377SDValue 6378X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 6379 int64_t Offset, 6380 SelectionDAG &DAG) const { 6381 // Create the TargetGlobalAddress node, folding in the constant 6382 // offset if it is legal. 6383 unsigned char OpFlags = 6384 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 6385 CodeModel::Model M = getTargetMachine().getCodeModel(); 6386 SDValue Result; 6387 if (OpFlags == X86II::MO_NO_FLAG && 6388 X86::isOffsetSuitableForCodeModel(Offset, M)) { 6389 // A direct static reference to a global. 6390 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 6391 Offset = 0; 6392 } else { 6393 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 6394 } 6395 6396 if (Subtarget->isPICStyleRIPRel() && 6397 (M == CodeModel::Small || M == CodeModel::Kernel)) 6398 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 6399 else 6400 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 6401 6402 // With PIC, the address is actually $g + Offset. 6403 if (isGlobalRelativeToPICBase(OpFlags)) { 6404 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6405 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 6406 Result); 6407 } 6408 6409 // For globals that require a load from a stub to get the address, emit the 6410 // load. 6411 if (isGlobalStubReference(OpFlags)) 6412 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 6413 MachinePointerInfo::getGOT(), false, false, 0); 6414 6415 // If there was a non-zero offset that we didn't fold, create an explicit 6416 // addition for it. 
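// For example, a reference to (gv + 8) through a GOT stub first loads the
// global's address from the stub entry and then adds 8 to the loaded value.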
6417 if (Offset != 0) 6418 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 6419 DAG.getConstant(Offset, getPointerTy())); 6420 6421 return Result; 6422} 6423 6424SDValue 6425X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 6426 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 6427 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 6428 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 6429} 6430 6431static SDValue 6432GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 6433 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 6434 unsigned char OperandFlags) { 6435 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6436 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 6437 DebugLoc dl = GA->getDebugLoc(); 6438 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 6439 GA->getValueType(0), 6440 GA->getOffset(), 6441 OperandFlags); 6442 if (InFlag) { 6443 SDValue Ops[] = { Chain, TGA, *InFlag }; 6444 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 6445 } else { 6446 SDValue Ops[] = { Chain, TGA }; 6447 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 6448 } 6449 6450 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls. 6451 MFI->setAdjustsStack(true); 6452 6453 SDValue Flag = Chain.getValue(1); 6454 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 6455} 6456 6457// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 6458static SDValue 6459LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6460 const EVT PtrVT) { 6461 SDValue InFlag; 6462 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 6463 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 6464 DAG.getNode(X86ISD::GlobalBaseReg, 6465 DebugLoc(), PtrVT), InFlag); 6466 InFlag = Chain.getValue(1); 6467 6468 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 6469} 6470 6471// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 6472static SDValue 6473LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6474 const EVT PtrVT) { 6475 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 6476 X86::RAX, X86II::MO_TLSGD); 6477} 6478 6479// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 6480// "local exec" model. 6481static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6482 const EVT PtrVT, TLSModel::Model model, 6483 bool is64Bit) { 6484 DebugLoc dl = GA->getDebugLoc(); 6485 6486 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 6487 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 6488 is64Bit ? 257 : 256)); 6489 6490 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 6491 DAG.getIntPtrConstant(0), 6492 MachinePointerInfo(Ptr), false, false, 0); 6493 6494 unsigned char OperandFlags = 0; 6495 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 6496 // initial exec. 6497 unsigned WrapperKind = X86ISD::Wrapper; 6498 if (model == TLSModel::LocalExec) { 6499 OperandFlags = is64Bit ?
X86II::MO_TPOFF : X86II::MO_NTPOFF; 6500 } else if (is64Bit) { 6501 assert(model == TLSModel::InitialExec); 6502 OperandFlags = X86II::MO_GOTTPOFF; 6503 WrapperKind = X86ISD::WrapperRIP; 6504 } else { 6505 assert(model == TLSModel::InitialExec); 6506 OperandFlags = X86II::MO_INDNTPOFF; 6507 } 6508 6509 // Emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 6510 // exec). 6511 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 6512 GA->getValueType(0), 6513 GA->getOffset(), OperandFlags); 6514 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 6515 6516 if (model == TLSModel::InitialExec) 6517 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 6518 MachinePointerInfo::getGOT(), false, false, 0); 6519 6520 // The address of the thread local variable is the add of the thread 6521 // pointer with the offset of the variable. 6522 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 6523} 6524 6525SDValue 6526X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 6527 6528 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 6529 const GlobalValue *GV = GA->getGlobal(); 6530 6531 if (Subtarget->isTargetELF()) { 6532 // TODO: implement the "local dynamic" model 6533 // TODO: implement the "initial exec" model for pic executables 6534 6535 // If GV is an alias then use the aliasee for determining 6536 // thread-localness. 6537 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 6538 GV = GA->resolveAliasedGlobal(false); 6539 6540 TLSModel::Model model 6541 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 6542 6543 switch (model) { 6544 case TLSModel::GeneralDynamic: 6545 case TLSModel::LocalDynamic: // not implemented 6546 if (Subtarget->is64Bit()) 6547 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 6548 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 6549 6550 case TLSModel::InitialExec: 6551 case TLSModel::LocalExec: 6552 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 6553 Subtarget->is64Bit()); 6554 } 6555 } else if (Subtarget->isTargetDarwin()) { 6556 // Darwin only has one model of TLS. Lower to that. 6557 unsigned char OpFlag = 0; 6558 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 6559 X86ISD::WrapperRIP : X86ISD::Wrapper; 6560 6561 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6562 // global base reg. 6563 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 6564 !Subtarget->is64Bit(); 6565 if (PIC32) 6566 OpFlag = X86II::MO_TLVP_PIC_BASE; 6567 else 6568 OpFlag = X86II::MO_TLVP; 6569 DebugLoc DL = Op.getDebugLoc(); 6570 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 6571 GA->getValueType(0), 6572 GA->getOffset(), OpFlag); 6573 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6574 6575 // With PIC32, the address is actually $g + Offset. 6576 if (PIC32) 6577 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6578 DAG.getNode(X86ISD::GlobalBaseReg, 6579 DebugLoc(), getPointerTy()), 6580 Offset); 6581 6582 // Lowering the machine ISD will make sure everything is in the right 6583 // location. 6584 SDValue Chain = DAG.getEntryNode(); 6585 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 6586 SDValue Args[] = { Chain, Offset }; 6587 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); 6588 6589 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
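// (On Darwin each thread-local variable is reached through a TLV descriptor
// whose first word points to a getter routine, normally tlv_get_addr;
// TLSCALL is expanded to an indirect call through that pointer.)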
6590 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6591 MFI->setAdjustsStack(true); 6592 6593 // And our return value (tls address) is in the standard call return value 6594 // location. 6595 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 6596 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); 6597 } 6598 6599 llvm_unreachable("TLS not implemented for this target."); 6600 6601 6602 6603 return SDValue(); 6604} 6605 6606 6607/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 6608/// take a 2 x i32 value to shift plus a shift amount. 6609SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 6610 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 6611 EVT VT = Op.getValueType(); 6612 unsigned VTBits = VT.getSizeInBits(); 6613 DebugLoc dl = Op.getDebugLoc(); 6614 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 6615 SDValue ShOpLo = Op.getOperand(0); 6616 SDValue ShOpHi = Op.getOperand(1); 6617 SDValue ShAmt = Op.getOperand(2); 6618 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 6619 DAG.getConstant(VTBits - 1, MVT::i8)) 6620 : DAG.getConstant(0, VT); 6621 6622 SDValue Tmp2, Tmp3; 6623 if (Op.getOpcode() == ISD::SHL_PARTS) { 6624 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 6625 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 6626 } else { 6627 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 6628 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 6629 } 6630 6631 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 6632 DAG.getConstant(VTBits, MVT::i8)); 6633 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 6634 AndNode, DAG.getConstant(0, MVT::i8)); 6635 6636 SDValue Hi, Lo; 6637 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6638 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 6639 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 6640 6641 if (Op.getOpcode() == ISD::SHL_PARTS) { 6642 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6643 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6644 } else { 6645 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6646 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6647 } 6648 6649 SDValue Ops[2] = { Lo, Hi }; 6650 return DAG.getMergeValues(Ops, 2, dl); 6651} 6652 6653SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 6654 SelectionDAG &DAG) const { 6655 EVT SrcVT = Op.getOperand(0).getValueType(); 6656 6657 if (SrcVT.isVector()) 6658 return SDValue(); 6659 6660 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 6661 "Unknown SINT_TO_FP to lower!"); 6662 6663 // These are really Legal; return the operand so the caller accepts it as 6664 // Legal.
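// (cvtsi2ss/cvtsi2sd can convert an i32 directly, and an i64 as well in
// 64-bit mode, so no expansion is needed while the result stays in SSE
// registers.)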
6665 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 6666 return Op; 6667 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 6668 Subtarget->is64Bit()) { 6669 return Op; 6670 } 6671 6672 DebugLoc dl = Op.getDebugLoc(); 6673 unsigned Size = SrcVT.getSizeInBits()/8; 6674 MachineFunction &MF = DAG.getMachineFunction(); 6675 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 6676 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6677 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6678 StackSlot, 6679 MachinePointerInfo::getFixedStack(SSFI), 6680 false, false, 0); 6681 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 6682} 6683 6684SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 6685 SDValue StackSlot, 6686 SelectionDAG &DAG) const { 6687 // Build the FILD 6688 DebugLoc DL = Op.getDebugLoc(); 6689 SDVTList Tys; 6690 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 6691 if (useSSE) 6692 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 6693 else 6694 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 6695 6696 unsigned ByteSize = SrcVT.getSizeInBits()/8; 6697 6698 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 6699 MachineMemOperand *MMO = 6700 DAG.getMachineFunction() 6701 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6702 MachineMemOperand::MOLoad, ByteSize, ByteSize); 6703 6704 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 6705 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 6706 X86ISD::FILD, DL, 6707 Tys, Ops, array_lengthof(Ops), 6708 SrcVT, MMO); 6709 6710 if (useSSE) { 6711 Chain = Result.getValue(1); 6712 SDValue InFlag = Result.getValue(2); 6713 6714 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 6715 // shouldn't be necessary except that RFP cannot be live across 6716 // multiple blocks. When stackifier is fixed, they can be uncoupled. 6717 MachineFunction &MF = DAG.getMachineFunction(); 6718 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 6719 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 6720 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6721 Tys = DAG.getVTList(MVT::Other); 6722 SDValue Ops[] = { 6723 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 6724 }; 6725 MachineMemOperand *MMO = 6726 DAG.getMachineFunction() 6727 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6728 MachineMemOperand::MOStore, SSFISize, SSFISize); 6729 6730 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 6731 Ops, array_lengthof(Ops), 6732 Op.getValueType(), MMO); 6733 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 6734 MachinePointerInfo::getFixedStack(SSFI), 6735 false, false, 0); 6736 } 6737 6738 return Result; 6739} 6740 6741// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 6742SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 6743 SelectionDAG &DAG) const { 6744 // This algorithm is not obvious. Here it is in C code, more or less: 6745 /* 6746 double uint64_to_double( uint32_t hi, uint32_t lo ) { 6747 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 6748 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 6749 6750 // Copy ints to xmm registers. 6751 __m128i xh = _mm_cvtsi32_si128( hi ); 6752 __m128i xl = _mm_cvtsi32_si128( lo ); 6753 6754 // Combine into low half of a single xmm register. 
6755 __m128i x = _mm_unpacklo_epi32( xh, xl ); 6756 __m128d d; 6757 double sd; 6758 6759 // Merge in appropriate exponents to give the integer bits the right 6760 // magnitude. 6761 x = _mm_unpacklo_epi32( x, exp ); 6762 6763 // Subtract away the biases to deal with the IEEE-754 double precision 6764 // implicit 1. 6765 d = _mm_sub_pd( (__m128d) x, bias ); 6766 6767 // All conversions up to here are exact. The correctly rounded result is 6768 // calculated using the current rounding mode using the following 6769 // horizontal add. 6770 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 6771 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 6772 // store doesn't really need to be here (except 6773 // maybe to zero the other double) 6774 return sd; 6775 } 6776 */ 6777 6778 DebugLoc dl = Op.getDebugLoc(); 6779 LLVMContext *Context = DAG.getContext(); 6780 6781 // Build some magic constants. 6782 std::vector<Constant*> CV0; 6783 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 6784 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 6785 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 6786 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 6787 Constant *C0 = ConstantVector::get(CV0); 6788 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 6789 6790 std::vector<Constant*> CV1; 6791 CV1.push_back( 6792 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 6793 CV1.push_back( 6794 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 6795 Constant *C1 = ConstantVector::get(CV1); 6796 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 6797 6798 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6799 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6800 Op.getOperand(0), 6801 DAG.getIntPtrConstant(1))); 6802 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6803 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6804 Op.getOperand(0), 6805 DAG.getIntPtrConstant(0))); 6806 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 6807 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 6808 MachinePointerInfo::getConstantPool(), 6809 false, false, 16); 6810 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 6811 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); 6812 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 6813 MachinePointerInfo::getConstantPool(), 6814 false, false, 16); 6815 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 6816 6817 // Add the halves; easiest way is to swap them into another reg first. 6818 int ShufMask[2] = { 1, -1 }; 6819 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 6820 DAG.getUNDEF(MVT::v2f64), ShufMask); 6821 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 6822 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 6823 DAG.getIntPtrConstant(0)); 6824} 6825 6826// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 6827SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 6828 SelectionDAG &DAG) const { 6829 DebugLoc dl = Op.getDebugLoc(); 6830 // FP constant to bias correct the final result. 6831 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 6832 MVT::f64); 6833 6834 // Load the 32-bit value into an XMM register. 
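// The trick parallels LowerUINT_TO_FP_i64 above. In C, more or less:
//
//   double uint32_to_double(uint32_t x) {
//     // 0x4330000000000000 is 2^52; OR-ing x into its mantissa produces
//     // the exact value 2^52 + x, so subtracting 2^52 recovers x.
//     double d = BitsToDouble(0x4330000000000000ULL | x);
//     return d - BitsToDouble(0x4330000000000000ULL);
//   }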
6835 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6836 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6837 Op.getOperand(0), 6838 DAG.getIntPtrConstant(0))); 6839 6840 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 6841 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), 6842 DAG.getIntPtrConstant(0)); 6843 6844 // Or the load with the bias. 6845 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 6846 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 6847 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 6848 MVT::v2f64, Load)), 6849 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 6850 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 6851 MVT::v2f64, Bias))); 6852 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 6853 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), 6854 DAG.getIntPtrConstant(0)); 6855 6856 // Subtract the bias. 6857 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 6858 6859 // Handle final rounding. 6860 EVT DestVT = Op.getValueType(); 6861 6862 if (DestVT.bitsLT(MVT::f64)) { 6863 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 6864 DAG.getIntPtrConstant(0)); 6865 } else if (DestVT.bitsGT(MVT::f64)) { 6866 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 6867 } 6868 6869 // DestVT is f64; no further rounding is needed. 6870 return Sub; 6871} 6872 6873SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 6874 SelectionDAG &DAG) const { 6875 SDValue N0 = Op.getOperand(0); 6876 DebugLoc dl = Op.getDebugLoc(); 6877 6878 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 6879 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 6880 // the optimization here. 6881 if (DAG.SignBitIsZero(N0)) 6882 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 6883 6884 EVT SrcVT = N0.getValueType(); 6885 EVT DstVT = Op.getValueType(); 6886 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 6887 return LowerUINT_TO_FP_i64(Op, DAG); 6888 else if (SrcVT == MVT::i32 && X86ScalarSSEf64) 6889 return LowerUINT_TO_FP_i32(Op, DAG); 6890 6891 // Make a 64-bit buffer, and use it to build an FILD. 6892 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 6893 if (SrcVT == MVT::i32) { 6894 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 6895 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 6896 getPointerTy(), StackSlot, WordOff); 6897 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6898 StackSlot, MachinePointerInfo(), 6899 false, false, 0); 6900 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 6901 OffsetSlot, MachinePointerInfo(), 6902 false, false, 0); 6903 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 6904 return Fild; 6905 } 6906 6907 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 6908 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6909 StackSlot, MachinePointerInfo(), 6910 false, false, 0); 6911 // For i64 source, we need to add the appropriate power of 2 if the input 6912 // was negative. This is the same as the optimization in 6913 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here, 6914 // we must be careful to do the computation in x87 extended precision, not 6915 // in SSE. (The generic code can't know it's OK to do this, or how to.)
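// FILD interprets the i64 as signed, so an unsigned input with its top bit
// set is read as (input - 2^64). The fudge factor 0x5F800000 below is 2^64
// as an IEEE single; it is added back exactly when the sign bit was set.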
6916 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 6917 MachineMemOperand *MMO = 6918 DAG.getMachineFunction() 6919 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6920 MachineMemOperand::MOLoad, 8, 8); 6921 6922 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 6923 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 6924 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3, 6925 MVT::i64, MMO); 6926 6927 APInt FF(32, 0x5F800000ULL); 6928 6929 // Check whether the sign bit is set. 6930 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 6931 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 6932 ISD::SETLT); 6933 6934 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 6935 SDValue FudgePtr = DAG.getConstantPool( 6936 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 6937 getPointerTy()); 6938 6939 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 6940 SDValue Zero = DAG.getIntPtrConstant(0); 6941 SDValue Four = DAG.getIntPtrConstant(4); 6942 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 6943 Zero, Four); 6944 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 6945 6946 // Load the value out, extending it from f32 to f80. 6947 // FIXME: Avoid the extend by constructing the right constant pool? 6948 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), 6949 FudgePtr, MachinePointerInfo::getConstantPool(), 6950 MVT::f32, false, false, 4); 6951 // Extend everything to 80 bits to force it to be done on x87. 6952 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 6953 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 6954} 6955 6956std::pair<SDValue,SDValue> X86TargetLowering:: 6957FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 6958 DebugLoc DL = Op.getDebugLoc(); 6959 6960 EVT DstTy = Op.getValueType(); 6961 6962 if (!IsSigned) { 6963 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 6964 DstTy = MVT::i64; 6965 } 6966 6967 assert(DstTy.getSimpleVT() <= MVT::i64 && 6968 DstTy.getSimpleVT() >= MVT::i16 && 6969 "Unknown FP_TO_SINT to lower!"); 6970 6971 // These are really Legal. 6972 if (DstTy == MVT::i32 && 6973 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 6974 return std::make_pair(SDValue(), SDValue()); 6975 if (Subtarget->is64Bit() && 6976 DstTy == MVT::i64 && 6977 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 6978 return std::make_pair(SDValue(), SDValue()); 6979 6980 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 6981 // stack slot. 
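// (FISTP rounds using the current x87 rounding mode, while FP-to-int
// conversion must truncate; the FP_TO_INT*_IN_MEM pseudos are expanded later
// into code that temporarily forces the control word to round-toward-zero.)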
6982 MachineFunction &MF = DAG.getMachineFunction(); 6983 unsigned MemSize = DstTy.getSizeInBits()/8; 6984 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 6985 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6986 6987 6988 6989 unsigned Opc; 6990 switch (DstTy.getSimpleVT().SimpleTy) { 6991 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 6992 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 6993 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 6994 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 6995 } 6996 6997 SDValue Chain = DAG.getEntryNode(); 6998 SDValue Value = Op.getOperand(0); 6999 EVT TheVT = Op.getOperand(0).getValueType(); 7000 if (isScalarFPTypeInSSEReg(TheVT)) { 7001 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 7002 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 7003 MachinePointerInfo::getFixedStack(SSFI), 7004 false, false, 0); 7005 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 7006 SDValue Ops[] = { 7007 Chain, StackSlot, DAG.getValueType(TheVT) 7008 }; 7009 7010 MachineMemOperand *MMO = 7011 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7012 MachineMemOperand::MOLoad, MemSize, MemSize); 7013 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, 7014 DstTy, MMO); 7015 Chain = Value.getValue(1); 7016 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 7017 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7018 } 7019 7020 MachineMemOperand *MMO = 7021 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7022 MachineMemOperand::MOStore, MemSize, MemSize); 7023 7024 // Build the FP_TO_INT*_IN_MEM 7025 SDValue Ops[] = { Chain, Value, StackSlot }; 7026 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 7027 Ops, 3, DstTy, MMO); 7028 7029 return std::make_pair(FIST, StackSlot); 7030} 7031 7032SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 7033 SelectionDAG &DAG) const { 7034 if (Op.getValueType().isVector()) 7035 return SDValue(); 7036 7037 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 7038 SDValue FIST = Vals.first, StackSlot = Vals.second; 7039 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 7040 if (FIST.getNode() == 0) return Op; 7041 7042 // Load the result. 7043 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 7044 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 7045} 7046 7047SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 7048 SelectionDAG &DAG) const { 7049 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 7050 SDValue FIST = Vals.first, StackSlot = Vals.second; 7051 assert(FIST.getNode() && "Unexpected failure"); 7052 7053 // Load the result. 
7054 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 7055 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 7056} 7057 7058SDValue X86TargetLowering::LowerFABS(SDValue Op, 7059 SelectionDAG &DAG) const { 7060 LLVMContext *Context = DAG.getContext(); 7061 DebugLoc dl = Op.getDebugLoc(); 7062 EVT VT = Op.getValueType(); 7063 EVT EltVT = VT; 7064 if (VT.isVector()) 7065 EltVT = VT.getVectorElementType(); 7066 std::vector<Constant*> CV; 7067 if (EltVT == MVT::f64) { 7068 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 7069 CV.push_back(C); 7070 CV.push_back(C); 7071 } else { 7072 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 7073 CV.push_back(C); 7074 CV.push_back(C); 7075 CV.push_back(C); 7076 CV.push_back(C); 7077 } 7078 Constant *C = ConstantVector::get(CV); 7079 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7080 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7081 MachinePointerInfo::getConstantPool(), 7082 false, false, 16); 7083 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 7084} 7085 7086SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 7087 LLVMContext *Context = DAG.getContext(); 7088 DebugLoc dl = Op.getDebugLoc(); 7089 EVT VT = Op.getValueType(); 7090 EVT EltVT = VT; 7091 if (VT.isVector()) 7092 EltVT = VT.getVectorElementType(); 7093 std::vector<Constant*> CV; 7094 if (EltVT == MVT::f64) { 7095 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 7096 CV.push_back(C); 7097 CV.push_back(C); 7098 } else { 7099 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 7100 CV.push_back(C); 7101 CV.push_back(C); 7102 CV.push_back(C); 7103 CV.push_back(C); 7104 } 7105 Constant *C = ConstantVector::get(CV); 7106 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7107 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7108 MachinePointerInfo::getConstantPool(), 7109 false, false, 16); 7110 if (VT.isVector()) { 7111 return DAG.getNode(ISD::BITCAST, dl, VT, 7112 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 7113 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7114 Op.getOperand(0)), 7115 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); 7116 } else { 7117 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 7118 } 7119} 7120 7121SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 7122 LLVMContext *Context = DAG.getContext(); 7123 SDValue Op0 = Op.getOperand(0); 7124 SDValue Op1 = Op.getOperand(1); 7125 DebugLoc dl = Op.getDebugLoc(); 7126 EVT VT = Op.getValueType(); 7127 EVT SrcVT = Op1.getValueType(); 7128 7129 // If second operand is smaller, extend it first. 7130 if (SrcVT.bitsLT(VT)) { 7131 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 7132 SrcVT = VT; 7133 } 7134 // And if it is bigger, shrink it first. 7135 if (SrcVT.bitsGT(VT)) { 7136 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 7137 SrcVT = VT; 7138 } 7139 7140 // At this point the operands and the result should have the same 7141 // type, and that won't be f80 since that is not custom lowered. 7142 7143 // First get the sign bit of second operand. 
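// copysign(Op0, Op1) is computed as (Op0 & ~sign-mask) | (Op1 & sign-mask):
// the magnitude bits come from Op0 and the sign bit from Op1.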
7144 std::vector<Constant*> CV; 7145 if (SrcVT == MVT::f64) { 7146 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 7147 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 7148 } else { 7149 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 7150 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7151 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7152 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7153 } 7154 Constant *C = ConstantVector::get(CV); 7155 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7156 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 7157 MachinePointerInfo::getConstantPool(), 7158 false, false, 16); 7159 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 7160 7161 // Shift sign bit right or left if the two operands have different types. 7162 if (SrcVT.bitsGT(VT)) { 7163 // Op0 is MVT::f32, Op1 is MVT::f64. 7164 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 7165 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 7166 DAG.getConstant(32, MVT::i32)); 7167 SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); 7168 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 7169 DAG.getIntPtrConstant(0)); 7170 } 7171 7172 // Clear first operand sign bit. 7173 CV.clear(); 7174 if (VT == MVT::f64) { 7175 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 7176 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 7177 } else { 7178 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 7179 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7180 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7181 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7182 } 7183 C = ConstantVector::get(CV); 7184 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7185 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7186 MachinePointerInfo::getConstantPool(), 7187 false, false, 16); 7188 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 7189 7190 // Or the value with the sign bit. 7191 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 7192} 7193 7194/// Emit nodes that will be selected as "test Op0,Op0", or something 7195/// equivalent. 7196SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 7197 SelectionDAG &DAG) const { 7198 DebugLoc dl = Op.getDebugLoc(); 7199 7200 // CF and OF aren't always set the way we want. Determine which 7201 // of these we need. 7202 bool NeedCF = false; 7203 bool NeedOF = false; 7204 switch (X86CC) { 7205 default: break; 7206 case X86::COND_A: case X86::COND_AE: 7207 case X86::COND_B: case X86::COND_BE: 7208 NeedCF = true; 7209 break; 7210 case X86::COND_G: case X86::COND_GE: 7211 case X86::COND_L: case X86::COND_LE: 7212 case X86::COND_O: case X86::COND_NO: 7213 NeedOF = true; 7214 break; 7215 } 7216 7217 // See if we can use the EFLAGS value from the operand instead of 7218 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 7219 // we prove that the arithmetic won't overflow, we can't use OF or CF. 7220 if (Op.getResNo() != 0 || NeedOF || NeedCF) 7221 // Emit a CMP with 0, which is the TEST pattern. 
7222 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 7223 DAG.getConstant(0, Op.getValueType())); 7224 7225 unsigned Opcode = 0; 7226 unsigned NumOperands = 0; 7227 switch (Op.getNode()->getOpcode()) { 7228 case ISD::ADD: 7229 // Due to an isel shortcoming, be conservative if this add is likely to be 7230 // selected as part of a load-modify-store instruction. When the root node 7231 // in a match is a store, isel doesn't know how to remap non-chain non-flag 7232 // uses of other nodes in the match, such as the ADD in this case. This 7233 // leads to the ADD being left around and reselected, with the result being 7234 // two adds in the output. Alas, even if none of our users are stores, that 7235 // doesn't prove we're O.K. Ergo, if we have any parents that aren't 7236 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require 7237 // climbing the DAG back to the root, and it doesn't seem to be worth the 7238 // effort. 7239 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 7240 UE = Op.getNode()->use_end(); UI != UE; ++UI) 7241 if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) 7242 goto default_case; 7243 7244 if (ConstantSDNode *C = 7245 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 7246 // An add of one will be selected as an INC. 7247 if (C->getAPIntValue() == 1) { 7248 Opcode = X86ISD::INC; 7249 NumOperands = 1; 7250 break; 7251 } 7252 7253 // An add of negative one (subtract of one) will be selected as a DEC. 7254 if (C->getAPIntValue().isAllOnesValue()) { 7255 Opcode = X86ISD::DEC; 7256 NumOperands = 1; 7257 break; 7258 } 7259 } 7260 7261 // Otherwise use a regular EFLAGS-setting add. 7262 Opcode = X86ISD::ADD; 7263 NumOperands = 2; 7264 break; 7265 case ISD::AND: { 7266 // If the primary result of the 'and' isn't used, don't bother using 7267 // X86ISD::AND, because a TEST instruction will be better. 7268 bool NonFlagUse = false; 7269 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 7270 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 7271 SDNode *User = *UI; 7272 unsigned UOpNo = UI.getOperandNo(); 7273 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { 7274 // Look past the truncate. 7275 UOpNo = User->use_begin().getOperandNo(); 7276 User = *User->use_begin(); 7277 } 7278 7279 if (User->getOpcode() != ISD::BRCOND && 7280 User->getOpcode() != ISD::SETCC && 7281 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { 7282 NonFlagUse = true; 7283 break; 7284 } 7285 } 7286 7287 if (!NonFlagUse) 7288 break; 7289 } 7290 // FALL THROUGH 7291 case ISD::SUB: 7292 case ISD::OR: 7293 case ISD::XOR: 7294 // Due to the ISEL shortcoming noted above, be conservative if this op is 7295 // likely to be selected as part of a load-modify-store instruction. 7296 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 7297 UE = Op.getNode()->use_end(); UI != UE; ++UI) 7298 if (UI->getOpcode() == ISD::STORE) 7299 goto default_case; 7300 7301 // Otherwise use a regular EFLAGS-setting instruction.
7302 switch (Op.getNode()->getOpcode()) { 7303 default: llvm_unreachable("unexpected operator!"); 7304 case ISD::SUB: Opcode = X86ISD::SUB; break; 7305 case ISD::OR: Opcode = X86ISD::OR; break; 7306 case ISD::XOR: Opcode = X86ISD::XOR; break; 7307 case ISD::AND: Opcode = X86ISD::AND; break; 7308 } 7309 7310 NumOperands = 2; 7311 break; 7312 case X86ISD::ADD: 7313 case X86ISD::SUB: 7314 case X86ISD::INC: 7315 case X86ISD::DEC: 7316 case X86ISD::OR: 7317 case X86ISD::XOR: 7318 case X86ISD::AND: 7319 return SDValue(Op.getNode(), 1); 7320 default: 7321 default_case: 7322 break; 7323 } 7324 7325 if (Opcode == 0) 7326 // Emit a CMP with 0, which is the TEST pattern. 7327 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 7328 DAG.getConstant(0, Op.getValueType())); 7329 7330 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 7331 SmallVector<SDValue, 4> Ops; 7332 for (unsigned i = 0; i != NumOperands; ++i) 7333 Ops.push_back(Op.getOperand(i)); 7334 7335 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 7336 DAG.ReplaceAllUsesWith(Op, New); 7337 return SDValue(New.getNode(), 1); 7338} 7339 7340/// Emit nodes that will be selected as "cmp Op0,Op1", or something 7341/// equivalent. 7342SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 7343 SelectionDAG &DAG) const { 7344 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 7345 if (C->getAPIntValue() == 0) 7346 return EmitTest(Op0, X86CC, DAG); 7347 7348 DebugLoc dl = Op0.getDebugLoc(); 7349 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 7350} 7351 7352/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 7353/// if possible. 7354SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 7355 DebugLoc dl, SelectionDAG &DAG) const { 7356 SDValue Op0 = And.getOperand(0); 7357 SDValue Op1 = And.getOperand(1); 7358 if (Op0.getOpcode() == ISD::TRUNCATE) 7359 Op0 = Op0.getOperand(0); 7360 if (Op1.getOpcode() == ISD::TRUNCATE) 7361 Op1 = Op1.getOperand(0); 7362 7363 SDValue LHS, RHS; 7364 if (Op1.getOpcode() == ISD::SHL) 7365 std::swap(Op0, Op1); 7366 if (Op0.getOpcode() == ISD::SHL) { 7367 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 7368 if (And00C->getZExtValue() == 1) { 7369 // If we looked past a truncate, check that it's only truncating away 7370 // known zeros. 7371 unsigned BitWidth = Op0.getValueSizeInBits(); 7372 unsigned AndBitWidth = And.getValueSizeInBits(); 7373 if (BitWidth > AndBitWidth) { 7374 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 7375 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 7376 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 7377 return SDValue(); 7378 } 7379 LHS = Op1; 7380 RHS = Op0.getOperand(1); 7381 } 7382 } else if (Op1.getOpcode() == ISD::Constant) { 7383 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 7384 SDValue AndLHS = Op0; 7385 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 7386 LHS = AndLHS.getOperand(0); 7387 RHS = AndLHS.getOperand(1); 7388 } 7389 } 7390 7391 if (LHS.getNode()) { 7392 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 7393 // instruction. Since the shift amount is in-range-or-undefined, we know 7394 // that doing a bittest on the i32 value is ok. We extend to i32 because 7395 // the encoding for the i16 version is larger than the i32 version. 7396 // Also promote i16 to i32 for performance / code size reasons.
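// For example, the test ((x >> 5) & 1) != 0 lowers to "bt $5, x" followed by
// a SETB of the carry flag.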
7397 if (LHS.getValueType() == MVT::i8 || 7398 LHS.getValueType() == MVT::i16) 7399 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 7400 7401 // If the operand types disagree, extend the shift amount to match. Since 7402 // BT ignores high bits (like shifts) we can use anyextend. 7403 if (LHS.getValueType() != RHS.getValueType()) 7404 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 7405 7406 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 7407 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 7408 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7409 DAG.getConstant(Cond, MVT::i8), BT); 7410 } 7411 7412 return SDValue(); 7413} 7414 7415SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 7416 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 7417 SDValue Op0 = Op.getOperand(0); 7418 SDValue Op1 = Op.getOperand(1); 7419 DebugLoc dl = Op.getDebugLoc(); 7420 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 7421 7422 // Optimize to BT if possible. 7423 // Lower (X & (1 << N)) == 0 to BT(X, N). 7424 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 7425 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 7426 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 7427 Op1.getOpcode() == ISD::Constant && 7428 cast<ConstantSDNode>(Op1)->isNullValue() && 7429 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7430 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 7431 if (NewSetCC.getNode()) 7432 return NewSetCC; 7433 } 7434 7435 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 7436 // these. 7437 if (Op1.getOpcode() == ISD::Constant && 7438 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 7439 cast<ConstantSDNode>(Op1)->isNullValue()) && 7440 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7441 7442 // If the input is a setcc, then reuse the input setcc or use a new one with 7443 // the inverted condition. 7444 if (Op0.getOpcode() == X86ISD::SETCC) { 7445 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 7446 bool Invert = (CC == ISD::SETNE) ^ 7447 cast<ConstantSDNode>(Op1)->isNullValue(); 7448 if (!Invert) return Op0; 7449 7450 CCode = X86::GetOppositeBranchCondition(CCode); 7451 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7452 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 7453 } 7454 } 7455 7456 bool isFP = Op1.getValueType().isFloatingPoint(); 7457 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 7458 if (X86CC == X86::COND_INVALID) 7459 return SDValue(); 7460 7461 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); 7462 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7463 DAG.getConstant(X86CC, MVT::i8), EFLAGS); 7464} 7465 7466SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 7467 SDValue Cond; 7468 SDValue Op0 = Op.getOperand(0); 7469 SDValue Op1 = Op.getOperand(1); 7470 SDValue CC = Op.getOperand(2); 7471 EVT VT = Op.getValueType(); 7472 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 7473 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 7474 DebugLoc dl = Op.getDebugLoc(); 7475 7476 if (isFP) { 7477 unsigned SSECC = 8; 7478 EVT VT0 = Op0.getValueType(); 7479 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 7480 unsigned Opc = VT0 == MVT::v4f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; 7481 bool Swap = false; 7482 7483 switch (SetCCOpcode) { 7484 default: break; 7485 case ISD::SETOEQ: 7486 case ISD::SETEQ: SSECC = 0; break; 7487 case ISD::SETOGT: 7488 case ISD::SETGT: Swap = true; // Fallthrough 7489 case ISD::SETLT: 7490 case ISD::SETOLT: SSECC = 1; break; 7491 case ISD::SETOGE: 7492 case ISD::SETGE: Swap = true; // Fallthrough 7493 case ISD::SETLE: 7494 case ISD::SETOLE: SSECC = 2; break; 7495 case ISD::SETUO: SSECC = 3; break; 7496 case ISD::SETUNE: 7497 case ISD::SETNE: SSECC = 4; break; 7498 case ISD::SETULE: Swap = true; 7499 case ISD::SETUGE: SSECC = 5; break; 7500 case ISD::SETULT: Swap = true; 7501 case ISD::SETUGT: SSECC = 6; break; 7502 case ISD::SETO: SSECC = 7; break; 7503 } 7504 if (Swap) 7505 std::swap(Op0, Op1); 7506 7507 // In the two special cases we can't handle, emit two comparisons. 7508 if (SSECC == 8) { 7509 if (SetCCOpcode == ISD::SETUEQ) { 7510 SDValue UNORD, EQ; 7511 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 7512 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 7513 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 7514 } 7515 else if (SetCCOpcode == ISD::SETONE) { 7516 SDValue ORD, NEQ; 7517 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 7518 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 7519 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 7520 } 7521 llvm_unreachable("Illegal FP comparison"); 7522 } 7523 // Handle all other FP comparisons here. 7524 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 7525 } 7526 7527 // We are handling one of the integer comparisons here. Since SSE only has 7528 // GT and EQ comparisons for integers, swapping operands and multiple 7529 // operations may be required for some comparisons. 7530 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 7531 bool Swap = false, Invert = false, FlipSigns = false; 7532 7533 switch (VT.getSimpleVT().SimpleTy) { 7534 default: break; 7535 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 7536 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 7537 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 7538 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 7539 } 7540 7541 switch (SetCCOpcode) { 7542 default: break; 7543 case ISD::SETNE: Invert = true; 7544 case ISD::SETEQ: Opc = EQOpc; break; 7545 case ISD::SETLT: Swap = true; 7546 case ISD::SETGT: Opc = GTOpc; break; 7547 case ISD::SETGE: Swap = true; 7548 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 7549 case ISD::SETULT: Swap = true; 7550 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 7551 case ISD::SETUGE: Swap = true; 7552 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 7553 } 7554 if (Swap) 7555 std::swap(Op0, Op1); 7556 7557 // Since SSE has no unsigned integer comparisons, we need to flip the sign 7558 // bits of the inputs before performing those operations.
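// XORing both operands with the sign bit turns an unsigned comparison into
// the equivalent signed one; e.g. (x <u y) on v4i32 becomes
// pcmpgtd(y ^ 0x80000000, x ^ 0x80000000).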
  if (FlipSigns) {
    EVT EltVT = VT.getVectorElementType();
    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
                                      EltVT);
    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
                                  SignBits.size());
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  return Result;
}

// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
  unsigned Opc = Op.getNode()->getOpcode();
  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
    return true;
  if (Op.getResNo() == 1 &&
      (Opc == X86ISD::ADD ||
       Opc == X86ISD::SUB ||
       Opc == X86ISD::ADC ||
       Opc == X86ISD::SBB ||
       Opc == X86ISD::SMUL ||
       Opc == X86ISD::UMUL ||
       Opc == X86ISD::INC ||
       Opc == X86ISD::DEC ||
       Opc == X86ISD::OR ||
       Opc == X86ISD::XOR ||
       Opc == X86ISD::AND))
    return true;

  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
    return true;

  return false;
}

static bool isZero(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  return C && C->isNullValue();
}

static bool isAllOnes(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  return C && C->isAllOnesValue();
}

SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  bool addTest = true;
  SDValue Cond = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue Op2 = Op.getOperand(2);
  DebugLoc DL = Op.getDebugLoc();
  SDValue CC;

  if (Cond.getOpcode() == ISD::SETCC) {
    SDValue NewCond = LowerSETCC(Cond, DAG);
    if (NewCond.getNode())
      Cond = NewCond;
  }

  // (select (x == 0), -1,  y) -> (sign_bit (x - 1)) | y
  // (select (x == 0),  y, -1) -> ~(sign_bit (x - 1)) | y
  // (select (x != 0),  y, -1) -> (sign_bit (x - 1)) | y
  // (select (x != 0), -1,  y) -> ~(sign_bit (x - 1)) | y
  if (Cond.getOpcode() == X86ISD::SETCC &&
      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
      isZero(Cond.getOperand(1).getOperand(1))) {
    SDValue Cmp = Cond.getOperand(1);

    unsigned CondCode = cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

    if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
      SDValue Y = isAllOnes(Op2) ? Op1 : Op2;

      SDValue CmpOp0 = Cmp.getOperand(0);
      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                        CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));

      SDValue Res =   // Res = 0 or -1.
        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                    DAG.getConstant(X86::COND_B, MVT::i8), Cmp);

      if (isAllOnes(Op1) != (CondCode == X86::COND_E))
        Res = DAG.getNOT(DL, Res, Res.getValueType());

      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
      if (N2C == 0 || !N2C->isNullValue())
        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
      return Res;
    }
  }

  // Look past (and (setcc_carry (cmp ...)), 1).
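  // The mask is a no-op for our purposes: SETCC_CARRY produces all zeros or
  // all ones, so bit 0 alone decides the truth of the condition.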
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    if (C && C->getAPIntValue() == 1)
      Cond = Cond.getOperand(0);
  }

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  if (Cond.getOpcode() == X86ISD::SETCC ||
      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    EVT VT = Op.getValueType();

    bool IllegalFPCMov = false;
    if (VT.isFloatingPoint() && !VT.isVector() &&
        !isScalarFPTypeInSSEReg(VT))  // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      addTest = false;
    }
  }

  if (addTest) {
    // Look past the truncate.
    if (Cond.getOpcode() == ISD::TRUNCATE)
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
      if (NewSetCC.getNode()) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DAG);
  }

  // a <  b ? -1 :  0 -> RES = ~setcc_carry
  // a <  b ?  0 : -1 -> RES = setcc_carry
  // a >= b ? -1 :  0 -> RES = setcc_carry
  // a >= b ?  0 : -1 -> RES = ~setcc_carry
  if (Cond.getOpcode() == X86ISD::CMP) {
    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
        (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                DAG.getConstant(X86::COND_B, MVT::i8), Cond);
      if (isAllOnes(Op1) != (CondCode == X86::COND_B))
        return DAG.getNOT(DL, Res, Res.getValueType());
      return Res;
    }
  }

  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
  SDValue Ops[] = { Op2, Op1, CC, Cond };
  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
}

// isAndOrOfSetCCs - Return true if node is an ISD::AND or ISD::OR of two
// X86ISD::SETCC nodes each of which has no other use apart from the
// AND / OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
  Opc = Op.getOpcode();
  if (Opc != ISD::OR && Opc != ISD::AND)
    return false;
  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(0).hasOneUse() &&
          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(1).hasOneUse());
}

// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
// 1 and that the SETCC node has a single use.
static bool isXor1OfSetCC(SDValue Op) {
  if (Op.getOpcode() != ISD::XOR)
    return false;
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (N1C && N1C->getAPIntValue() == 1) {
    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
      Op.getOperand(0).hasOneUse();
  }
  return false;
}

SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  bool addTest = true;
  SDValue Chain = Op.getOperand(0);
  SDValue Cond  = Op.getOperand(1);
  SDValue Dest  = Op.getOperand(2);
  DebugLoc dl = Op.getDebugLoc();
  SDValue CC;

  if (Cond.getOpcode() == ISD::SETCC) {
    SDValue NewCond = LowerSETCC(Cond, DAG);
    if (NewCond.getNode())
      Cond = NewCond;
  }
#if 0
  // FIXME: LowerXALUO doesn't handle these!!
  else if (Cond.getOpcode() == X86ISD::ADD ||
           Cond.getOpcode() == X86ISD::SUB ||
           Cond.getOpcode() == X86ISD::SMUL ||
           Cond.getOpcode() == X86ISD::UMUL)
    Cond = LowerXALUO(Cond, DAG);
#endif

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    if (C && C->getAPIntValue() == 1)
      Cond = Cond.getOperand(0);
  }

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  if (Cond.getOpcode() == X86ISD::SETCC ||
      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
      Cond = Cmp;
      addTest = false;
    } else {
      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
      default: break;
      case X86::COND_O:
      case X86::COND_B:
        // These can only come from an arithmetic instruction with overflow,
        // e.g. SADDO, UADDO.
        Cond = Cond.getNode()->getOperand(1);
        addTest = false;
        break;
      }
    }
  } else {
    unsigned CondOpc;
    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
      SDValue Cmp = Cond.getOperand(0).getOperand(1);
      if (CondOpc == ISD::OR) {
        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
        // two branches instead of an explicit OR instruction with a
        // separate test.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp)) {
          CC = Cond.getOperand(0).getOperand(0);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = Cond.getOperand(1).getOperand(0);
          Cond = Cmp;
          addTest = false;
        }
      } else { // ISD::AND
        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
        // two branches instead of an explicit AND instruction with a
        // separate test. However, we only do this if this block doesn't
        // have a fall-through edge, because this requires an explicit
        // jmp when the condition is false.
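        // Concretely, for (brcond (and (setcc cc0), (setcc cc1)), dest) we
        // emit a branch on !cc0 to the original fall-through block, then a
        // branch on cc1 to dest, with both branches reusing the one compare
        // that feeds the two setccs.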
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp) &&
            Op.getNode()->hasOneUse()) {
          X86::CondCode CCode =
            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
          CCode = X86::GetOppositeBranchCondition(CCode);
          CC = DAG.getConstant(CCode, MVT::i8);
          SDNode *User = *Op.getNode()->use_begin();
          // Look for an unconditional branch following this conditional branch.
          // We need this because we need to reverse the successors in order
          // to implement FCMP_OEQ.
          if (User->getOpcode() == ISD::BR) {
            SDValue FalseBB = User->getOperand(1);
            SDNode *NewBR =
              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
            assert(NewBR == User);
            (void)NewBR;
            Dest = FalseBB;

            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                                Chain, Dest, CC, Cmp);
            X86::CondCode CCode =
              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
            CCode = X86::GetOppositeBranchCondition(CCode);
            CC = DAG.getConstant(CCode, MVT::i8);
            Cond = Cmp;
            addTest = false;
          }
        }
      }
    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
      // Recognize 'xorb (setcc), 1' patterns; the xor inverts the condition.
      // It should have been transformed by the DAG combiner, except when the
      // condition is set by an arithmetic-with-overflow node.
      X86::CondCode CCode =
        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
      CCode = X86::GetOppositeBranchCondition(CCode);
      CC = DAG.getConstant(CCode, MVT::i8);
      Cond = Cond.getOperand(0).getOperand(1);
      addTest = false;
    }
  }

  if (addTest) {
    // Look past the truncate.
    if (Cond.getOpcode() == ISD::TRUNCATE)
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
      if (NewSetCC.getNode()) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DAG);
  }
  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                     Chain, Dest, CC, Cond);
}


// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// correct sequence.
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                           SelectionDAG &DAG) const {
  assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) &&
         "This should be used only on Windows targets");
  DebugLoc dl = Op.getDebugLoc();

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue Size  = Op.getOperand(1);
  // FIXME: Ensure alignment here

  SDValue Flag;
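  // The expansion below is: move the size into EAX, emit the WIN_ALLOCA
  // pseudo (which probes and adjusts the stack), then read the new stack
  // pointer back as the result of the node.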
  EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;

  Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
  Flag = Chain.getValue(1);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
  Flag = Chain.getValue(1);

  Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);

  SDValue Ops1[2] = { Chain.getValue(0), Chain };
  return DAG.getMergeValues(Ops1, 2, dl);
}

SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  DebugLoc DL = Op.getDebugLoc();

  if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
                                   getPointerTy());
    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                        MachinePointerInfo(SV), false, false, 0);
  }

  // __va_list_tag:
  //   gp_offset         (0 - 6 * 8)
  //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (point to parameters coming in memory).
  //   reg_save_area
  SmallVector<SDValue, 8> MemOps;
  SDValue FIN = Op.getOperand(1);
  // Store gp_offset
  SDValue Store = DAG.getStore(Op.getOperand(0), DL,
                               DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
                                               MVT::i32),
                               FIN, MachinePointerInfo(SV), false, false, 0);
  MemOps.push_back(Store);

  // Store fp_offset
  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(4));
  Store = DAG.getStore(Op.getOperand(0), DL,
                       DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
                                       MVT::i32),
                       FIN, MachinePointerInfo(SV, 4), false, false, 0);
  MemOps.push_back(Store);

  // Store ptr to overflow_arg_area
  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(4));
  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
                                    getPointerTy());
  Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
                       MachinePointerInfo(SV, 8),
                       false, false, 0);
  MemOps.push_back(Store);
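  // Running offsets into the va_list: gp_offset at +0, fp_offset at +4,
  // overflow_arg_area at +8, and reg_save_area at +16 (stored next).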
  // Store ptr to reg_save_area.
  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(8));
  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                    getPointerTy());
  Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
                       MachinePointerInfo(SV, 16), false, false, 0);
  MemOps.push_back(Store);
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                     &MemOps[0], MemOps.size());
}

SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->is64Bit() &&
         "LowerVAARG only handles 64-bit va_arg!");
  assert((Subtarget->isTargetLinux() ||
          Subtarget->isTargetDarwin()) &&
         "Unhandled target in LowerVAARG");
  assert(Op.getNode()->getNumOperands() == 4);
  SDValue Chain = Op.getOperand(0);
  SDValue SrcPtr = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  unsigned Align = Op.getConstantOperandVal(3);
  DebugLoc dl = Op.getDebugLoc();

  EVT ArgVT = Op.getNode()->getValueType(0);
  const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy);
  uint8_t ArgMode;

  // Decide which area this value should be read from.
  // TODO: Implement the AMD64 ABI in its entirety. This simple
  // selection mechanism works only for the basic types.
  if (ArgVT == MVT::f80) {
    llvm_unreachable("va_arg for f80 not yet implemented");
  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
  } else {
    llvm_unreachable("Unhandled argument type in LowerVAARG");
  }

  if (ArgMode == 2) {
    // Sanity Check: Make sure using fp_offset makes sense.
    assert(!UseSoftFloat &&
           !(DAG.getMachineFunction()
             .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) &&
           Subtarget->hasXMM());
  }

  // Insert VAARG_64 node into the DAG
  // VAARG_64 returns two values: Variable Argument Address, Chain
  SmallVector<SDValue, 11> InstOps;
  InstOps.push_back(Chain);
  InstOps.push_back(SrcPtr);
  InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
  InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
  InstOps.push_back(DAG.getConstant(Align, MVT::i32));
  SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
                                          VTs, &InstOps[0], InstOps.size(),
                                          MVT::i64,
                                          MachinePointerInfo(SV),
                                          /*Align=*/0,
                                          /*Volatile=*/false,
                                          /*ReadMem=*/true,
                                          /*WriteMem=*/true);
  Chain = VAARG.getValue(1);

  // Load the next argument and return it
  return DAG.getLoad(ArgVT, dl,
                     Chain,
                     VAARG,
                     MachinePointerInfo(),
                     false, false, 0);
}

SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
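  // That is 4 + 4 + 8 + 8 = 24 bytes with 8-byte alignment, so va_copy
  // reduces to a fixed-size memcpy.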
  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
  SDValue Chain = Op.getOperand(0);
  SDValue DstPtr = Op.getOperand(1);
  SDValue SrcPtr = Op.getOperand(2);
  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  DebugLoc DL = Op.getDebugLoc();

  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
                       DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
                       false,
                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}

SDValue
X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  switch (IntNo) {
  default: return SDValue();    // Don't custom lower most intrinsics.
  // Comparison intrinsics.
  case Intrinsic::x86_sse_comieq_ss:
  case Intrinsic::x86_sse_comilt_ss:
  case Intrinsic::x86_sse_comile_ss:
  case Intrinsic::x86_sse_comigt_ss:
  case Intrinsic::x86_sse_comige_ss:
  case Intrinsic::x86_sse_comineq_ss:
  case Intrinsic::x86_sse_ucomieq_ss:
  case Intrinsic::x86_sse_ucomilt_ss:
  case Intrinsic::x86_sse_ucomile_ss:
  case Intrinsic::x86_sse_ucomigt_ss:
  case Intrinsic::x86_sse_ucomige_ss:
  case Intrinsic::x86_sse_ucomineq_ss:
  case Intrinsic::x86_sse2_comieq_sd:
  case Intrinsic::x86_sse2_comilt_sd:
  case Intrinsic::x86_sse2_comile_sd:
  case Intrinsic::x86_sse2_comigt_sd:
  case Intrinsic::x86_sse2_comige_sd:
  case Intrinsic::x86_sse2_comineq_sd:
  case Intrinsic::x86_sse2_ucomieq_sd:
  case Intrinsic::x86_sse2_ucomilt_sd:
  case Intrinsic::x86_sse2_ucomile_sd:
  case Intrinsic::x86_sse2_ucomigt_sd:
  case Intrinsic::x86_sse2_ucomige_sd:
  case Intrinsic::x86_sse2_ucomineq_sd: {
    unsigned Opc = 0;
    ISD::CondCode CC = ISD::SETCC_INVALID;
    switch (IntNo) {
    default: break;
    case Intrinsic::x86_sse_comieq_ss:
    case Intrinsic::x86_sse2_comieq_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETEQ;
      break;
    case Intrinsic::x86_sse_comilt_ss:
    case Intrinsic::x86_sse2_comilt_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETLT;
      break;
    case Intrinsic::x86_sse_comile_ss:
    case Intrinsic::x86_sse2_comile_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETLE;
      break;
    case Intrinsic::x86_sse_comigt_ss:
    case Intrinsic::x86_sse2_comigt_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETGT;
      break;
    case Intrinsic::x86_sse_comige_ss:
    case Intrinsic::x86_sse2_comige_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETGE;
      break;
    case Intrinsic::x86_sse_comineq_ss:
    case Intrinsic::x86_sse2_comineq_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETNE;
      break;
    case Intrinsic::x86_sse_ucomieq_ss:
    case Intrinsic::x86_sse2_ucomieq_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETEQ;
      break;
    case Intrinsic::x86_sse_ucomilt_ss:
    case Intrinsic::x86_sse2_ucomilt_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETLT;
      break;
    case Intrinsic::x86_sse_ucomile_ss:
    case Intrinsic::x86_sse2_ucomile_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETLE;
      break;
    case Intrinsic::x86_sse_ucomigt_ss:
    case Intrinsic::x86_sse2_ucomigt_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETGT;
      break;
    case Intrinsic::x86_sse_ucomige_ss:
    case Intrinsic::x86_sse2_ucomige_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETGE;
      break;
    case Intrinsic::x86_sse_ucomineq_ss:
    case Intrinsic::x86_sse2_ucomineq_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETNE;
      break;
    }

    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                DAG.getConstant(X86CC, MVT::i8), Cond);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
  // ptest and testp intrinsics. The intrinsics these come from are designed
  // to return an integer value, not just a flag-setting instruction, so
  // lower them to the ptest or testp pattern plus a setcc for the result.
  case Intrinsic::x86_sse41_ptestz:
  case Intrinsic::x86_sse41_ptestc:
  case Intrinsic::x86_sse41_ptestnzc:
  case Intrinsic::x86_avx_ptestz_256:
  case Intrinsic::x86_avx_ptestc_256:
  case Intrinsic::x86_avx_ptestnzc_256:
  case Intrinsic::x86_avx_vtestz_ps:
  case Intrinsic::x86_avx_vtestc_ps:
  case Intrinsic::x86_avx_vtestnzc_ps:
  case Intrinsic::x86_avx_vtestz_pd:
  case Intrinsic::x86_avx_vtestc_pd:
  case Intrinsic::x86_avx_vtestnzc_pd:
  case Intrinsic::x86_avx_vtestz_ps_256:
  case Intrinsic::x86_avx_vtestc_ps_256:
  case Intrinsic::x86_avx_vtestnzc_ps_256:
  case Intrinsic::x86_avx_vtestz_pd_256:
  case Intrinsic::x86_avx_vtestc_pd_256:
  case Intrinsic::x86_avx_vtestnzc_pd_256: {
    bool IsTestPacked = false;
    unsigned X86CC = 0;
    switch (IntNo) {
    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
    case Intrinsic::x86_avx_vtestz_ps:
    case Intrinsic::x86_avx_vtestz_pd:
    case Intrinsic::x86_avx_vtestz_ps_256:
    case Intrinsic::x86_avx_vtestz_pd_256:
      IsTestPacked = true; // Fallthrough
    case Intrinsic::x86_sse41_ptestz:
    case Intrinsic::x86_avx_ptestz_256:
      // ZF = 1
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_avx_vtestc_ps:
    case Intrinsic::x86_avx_vtestc_pd:
    case Intrinsic::x86_avx_vtestc_ps_256:
    case Intrinsic::x86_avx_vtestc_pd_256:
      IsTestPacked = true; // Fallthrough
    case Intrinsic::x86_sse41_ptestc:
    case Intrinsic::x86_avx_ptestc_256:
      // CF = 1
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_avx_vtestnzc_ps:
    case Intrinsic::x86_avx_vtestnzc_pd:
    case Intrinsic::x86_avx_vtestnzc_ps_256:
    case Intrinsic::x86_avx_vtestnzc_pd_256:
      IsTestPacked = true; // Fallthrough
    case Intrinsic::x86_sse41_ptestnzc:
    case Intrinsic::x86_avx_ptestnzc_256:
      // ZF and CF = 0
      X86CC = X86::COND_A;
      break;
    }

    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
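  // For example, ptestz(a, b) becomes (zext (setcc COND_E (PTEST a, b))):
  // PTEST sets ZF exactly when (a & b) == 0, and the SETCC materializes
  // that flag as the i32 value the intrinsic is defined to return.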
  // Fix vector shift instructions where the last operand is a non-immediate
  // i32 value.
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_mmx_pslli_w:
  case Intrinsic::x86_mmx_pslli_d:
  case Intrinsic::x86_mmx_pslli_q:
  case Intrinsic::x86_mmx_psrli_w:
  case Intrinsic::x86_mmx_psrli_d:
  case Intrinsic::x86_mmx_psrli_q:
  case Intrinsic::x86_mmx_psrai_w:
  case Intrinsic::x86_mmx_psrai_d: {
    SDValue ShAmt = Op.getOperand(2);
    if (isa<ConstantSDNode>(ShAmt))
      return SDValue();

    unsigned NewIntNo = 0;
    EVT ShAmtVT = MVT::v4i32;
    switch (IntNo) {
    case Intrinsic::x86_sse2_pslli_w:
      NewIntNo = Intrinsic::x86_sse2_psll_w;
      break;
    case Intrinsic::x86_sse2_pslli_d:
      NewIntNo = Intrinsic::x86_sse2_psll_d;
      break;
    case Intrinsic::x86_sse2_pslli_q:
      NewIntNo = Intrinsic::x86_sse2_psll_q;
      break;
    case Intrinsic::x86_sse2_psrli_w:
      NewIntNo = Intrinsic::x86_sse2_psrl_w;
      break;
    case Intrinsic::x86_sse2_psrli_d:
      NewIntNo = Intrinsic::x86_sse2_psrl_d;
      break;
    case Intrinsic::x86_sse2_psrli_q:
      NewIntNo = Intrinsic::x86_sse2_psrl_q;
      break;
    case Intrinsic::x86_sse2_psrai_w:
      NewIntNo = Intrinsic::x86_sse2_psra_w;
      break;
    case Intrinsic::x86_sse2_psrai_d:
      NewIntNo = Intrinsic::x86_sse2_psra_d;
      break;
    default: {
      ShAmtVT = MVT::v2i32;
      switch (IntNo) {
      case Intrinsic::x86_mmx_pslli_w:
        NewIntNo = Intrinsic::x86_mmx_psll_w;
        break;
      case Intrinsic::x86_mmx_pslli_d:
        NewIntNo = Intrinsic::x86_mmx_psll_d;
        break;
      case Intrinsic::x86_mmx_pslli_q:
        NewIntNo = Intrinsic::x86_mmx_psll_q;
        break;
      case Intrinsic::x86_mmx_psrli_w:
        NewIntNo = Intrinsic::x86_mmx_psrl_w;
        break;
      case Intrinsic::x86_mmx_psrli_d:
        NewIntNo = Intrinsic::x86_mmx_psrl_d;
        break;
      case Intrinsic::x86_mmx_psrli_q:
        NewIntNo = Intrinsic::x86_mmx_psrl_q;
        break;
      case Intrinsic::x86_mmx_psrai_w:
        NewIntNo = Intrinsic::x86_mmx_psra_w;
        break;
      case Intrinsic::x86_mmx_psrai_d:
        NewIntNo = Intrinsic::x86_mmx_psra_d;
        break;
      default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
      }
      break;
    }
    }

    // The vector shift intrinsics with scalar shift amounts use 32-bit
    // values, but the SSE2/MMX shift instructions read 64 bits. Set the
    // upper 32 bits to zero.
    SDValue ShOps[4];
    ShOps[0] = ShAmt;
    ShOps[1] = DAG.getConstant(0, MVT::i32);
    if (ShAmtVT == MVT::v4i32) {
      ShOps[2] = DAG.getUNDEF(MVT::i32);
      ShOps[3] = DAG.getUNDEF(MVT::i32);
      ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
    } else {
      ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
// FIXME this must be lowered to get rid of the invalid type.
    }

    EVT VT = Op.getValueType();
    ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(NewIntNo, MVT::i32),
                       Op.getOperand(1), ShAmt);
  }
  }
}

SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  MFI->setReturnAddressIsTaken(true);

  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  DebugLoc dl = Op.getDebugLoc();

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset =
      DAG.getConstant(TD->getPointerSize(),
                      Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
                                   FrameAddr, Offset),
                       MachinePointerInfo(), false, false, 0);
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                     RetAddrFI, MachinePointerInfo(), false, false, 0);
}

SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  MFI->setFrameAddressIsTaken(true);

  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo(),
                            false, false, 0);
  return FrameAddr;
}

SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) const {
  return DAG.getIntPtrConstant(2*TD->getPointerSize());
}

SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain     = Op.getOperand(0);
  SDValue Offset    = Op.getOperand(1);
  SDValue Handler   = Op.getOperand(2);
  DebugLoc dl       = Op.getDebugLoc();

  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
                                     Subtarget->is64Bit() ? X86::RBP : X86::EBP,
                                     getPointerTy());
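  // eh.return writes the handler address into the caller's return-address
  // slot (frame pointer + pointer size, adjusted by Offset) and passes that
  // slot's address to the EH_RETURN pseudo in RCX/ECX.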
  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);

  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame,
                                  DAG.getIntPtrConstant(TD->getPointerSize()));
  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
                       false, false, 0);
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
  MF.getRegInfo().addLiveOut(StoreAddrReg);

  return DAG.getNode(X86ISD::EH_RETURN, dl,
                     MVT::Other,
                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
}

SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  DebugLoc dl  = Op.getDebugLoc();

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

  if (Subtarget->is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
    const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr),
                                false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, MVT::i64));
    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
                                MachinePointerInfo(TrmpAddr, 2),
                                false, false, 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10),
                                false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, MVT::i64));
    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 12),
                                false, false, 2);
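    // For reference, the finished 64-bit trampoline is laid out as:
    //   offset  0: 49 BB <fptr, 8 bytes>   movabsq $fptr, %r11
    //   offset 10: 49 BA <nest, 8 bytes>   movabsq $nest, %r10
    //   offset 20: 49 FF E3                jmpq   *%r11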
    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20),
                                false, false, 0);

    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
                                MachinePointerInfo(TrmpAddr, 22),
                                false, false, 0);

    SDValue Ops[] =
      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
    return DAG.getMergeValues(Ops, 2, dl);
  } else {
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      const FunctionType *FTy = Func->getFunctionType();
      const AttrListPtr &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 1;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;

        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::X86_ThisCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
    OutChains[0] = DAG.getStore(Root, dl,
                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
                                Trmp, MachinePointerInfo(TrmpAddr),
                                false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, MVT::i32));
    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 1),
                                false, false, 1);
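    // The finished 32-bit trampoline is:
    //   offset 0: B8+r <nest, 4 bytes>   movl $nest, %reg
    //   offset 5: E9 <disp, 4 bytes>     jmp fptr (PC-relative)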
    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
                                MachinePointerInfo(TrmpAddr, 5),
                                false, false, 1);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, MVT::i32));
    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
                                MachinePointerInfo(TrmpAddr, 6),
                                false, false, 1);

    SDValue Ops[] =
      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
    return DAG.getMergeValues(Ops, 2, dl);
  }
}

SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
  */

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameLowering &TFI = *TM.getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  EVT VT = Op.getValueType();
  DebugLoc DL = Op.getDebugLoc();

  // Save FP Control Word to stack slot
  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());

  MachineMemOperand *MMO =
    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                            MachineMemOperand::MOStore, 2, 2);

  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
                                          DAG.getVTList(MVT::Other),
                                          Ops, 2, MVT::i16, MMO);

  // Load FP Control Word from stack slot
  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
                            MachinePointerInfo(), false, false, 0);

  // Transform as necessary
  SDValue CWD1 =
    DAG.getNode(ISD::SRL, DL, MVT::i16,
                DAG.getNode(ISD::AND, DL, MVT::i16,
                            CWD, DAG.getConstant(0x800, MVT::i16)),
                DAG.getConstant(11, MVT::i8));
  SDValue CWD2 =
    DAG.getNode(ISD::SRL, DL, MVT::i16,
                DAG.getNode(ISD::AND, DL, MVT::i16,
                            CWD, DAG.getConstant(0x400, MVT::i16)),
                DAG.getConstant(9, MVT::i8));

  SDValue RetVal =
    DAG.getNode(ISD::AND, DL, MVT::i16,
                DAG.getNode(ISD::ADD, DL, MVT::i16,
                            DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
                            DAG.getConstant(1, MVT::i16)),
                DAG.getConstant(3, MVT::i16));

  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}

SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  EVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is no i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);

  // If src is zero (i.e. bsr sets ZF), returns NumBits.
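  // For i32, bsr returns the index of the highest set bit; since NumBits-1
  // (31) is all ones in the low five bits, bsr(x) ^ 31 == 31 - bsr(x), which
  // is ctlz(x). The CMOV below substitutes 2*NumBits-1 (63) when the input
  // was zero so that the final xor still yields NumBits (32).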
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits+NumBits-1, OpVT),
    DAG.getConstant(X86::COND_E, MVT::i8),
    Op.getValue(1)
  };
  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));

  // Finally xor with NumBits-1.
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}

SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  EVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits, OpVT),
    DAG.getConstant(X86::COND_E, MVT::i8),
    Op.getValue(1)
  };
  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}

SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
  DebugLoc dl = Op.getDebugLoc();

  //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
  //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
  //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
  //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
  //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
  //
  //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
  //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
  //  return AloBlo + AloBhi + AhiBlo;

  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                       A, DAG.getConstant(32, MVT::i32));
  SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                       B, DAG.getConstant(32, MVT::i32));
  SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       A, B);
  SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       A, Bhi);
  SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       Ahi, B);
  AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                       AloBhi, DAG.getConstant(32, MVT::i32));
  AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                       AhiBlo, DAG.getConstant(32, MVT::i32));
  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
  Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
  return Res;
}

SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
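  // SSE has no variable-amount vector shift instruction, so this lowering
  // emulates it: v4i32 builds 2^amt as a float by adding (amt << 23) to the
  // exponent field of 1.0f (0x3f800000) and multiplies; v16i8 shifts
  // bit-by-bit, selecting partial shifts with PBLENDVB.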
  DebugLoc dl = Op.getDebugLoc();
  SDValue R = Op.getOperand(0);

  LLVMContext *Context = DAG.getContext();

  assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later");

  if (VT == MVT::v4i32) {
    Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
                     Op.getOperand(1), DAG.getConstant(23, MVT::i32));

    ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U));

    std::vector<Constant*> CV(4, CI);
    Constant *C = ConstantVector::get(CV);
    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
    SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                                 MachinePointerInfo::getConstantPool(),
                                 false, false, 16);

    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
  }
  if (VT == MVT::v16i8) {
    // a = a << 5;
    Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
                     Op.getOperand(1), DAG.getConstant(5, MVT::i32));

    ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15));
    ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63));

    std::vector<Constant*> CVM1(16, CM1);
    std::vector<Constant*> CVM2(16, CM2);
    Constant *C = ConstantVector::get(CVM1);
    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
    SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                            MachinePointerInfo::getConstantPool(),
                            false, false, 16);

    // r = pblendv(r, psllw(r & (char16)15, 4), a);
    M = DAG.getNode(ISD::AND, dl, VT, R, M);
    M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                    DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
                    DAG.getConstant(4, MVT::i32));
    R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op);
    // a += a
    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);

    C = ConstantVector::get(CVM2);
    CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
    M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                    MachinePointerInfo::getConstantPool(),
                    false, false, 16);

    // r = pblendv(r, psllw(r & (char16)63, 2), a);
    M = DAG.getNode(ISD::AND, dl, VT, R, M);
    M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                    DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
                    DAG.getConstant(2, MVT::i32));
    R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op);
    // a += a
    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);

    // return pblendv(r, r+r, a);
    R = DAG.getNode(X86ISD::PBLENDVB, dl, VT,
                    R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op);
    return R;
  }
  return SDValue();
}

SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
  // looks for this combo and may remove the "setcc" instruction if the "setcc"
  // has only one use.
  SDNode *N = Op.getNode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  unsigned BaseOp = 0;
  unsigned Cond = 0;
  DebugLoc DL = Op.getDebugLoc();
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // An add of one will be selected as an INC. Note that INC doesn't
    // set CF, so we can't do this for UADDO.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
      if (C->isOne()) {
        BaseOp = X86ISD::INC;
        Cond = X86::COND_O;
        break;
      }
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
    // A subtract of one will be selected as a DEC. Note that DEC doesn't
    // set CF, so we can't do this for USUBO.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
      if (C->isOne()) {
        BaseOp = X86ISD::DEC;
        Cond = X86::COND_O;
        break;
      }
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
    BaseOp = X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
                                 MVT::i32);
    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);

    SDValue SetCC =
      DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                  DAG.getConstant(X86::COND_O, MVT::i32),
                  SDValue(Sum.getNode(), 2));

    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
    return Sum;
  }
  }

  // Also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);

  SDValue SetCC =
    DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
                DAG.getConstant(Cond, MVT::i32),
                SDValue(Sum.getNode(), 1));

  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
  return Sum;
}

SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
  DebugLoc dl = Op.getDebugLoc();

  if (!Subtarget->hasSSE2()) {
    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0,
                                   Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
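    // Without SSE2 there is no mfence; a locked OR of zero into the top of
    // the stack orders memory the same way and works on older processors.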
    SDValue Ops[] = {
      DAG.getRegister(X86::ESP, MVT::i32), // Base
      DAG.getTargetConstant(1, MVT::i8),   // Scale
      DAG.getRegister(0, MVT::i32),        // Index
      DAG.getTargetConstant(0, MVT::i32),  // Disp
      DAG.getRegister(0, MVT::i32),        // Segment.
      Zero,
      Chain
    };
    SDNode *Res =
      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
                         array_lengthof(Ops));
    return SDValue(Res, 0);
  }

  unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
  if (!isDev)
    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));

  unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
  unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();

  // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
  if (!Op1 && !Op2 && !Op3 && Op4)
    return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0));

  // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
  if (Op1 && !Op2 && !Op3 && !Op4)
    return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0));

  // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)),
  //           (MFENCE)>;
  return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
}

SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
  EVT T = Op.getValueType();
  DebugLoc DL = Op.getDebugLoc();
  unsigned Reg = 0;
  unsigned size = 0;
  switch(T.getSimpleVT().SimpleTy) {
  default:
    assert(false && "Invalid value type!");
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
  case MVT::i16: Reg = X86::AX;  size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget->is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
                                  Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, MVT::i8),
                    cpIn.getValue(1) };
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                           Ops, 5, T, MMO);
  SDValue cpOut =
    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
  return cpOut;
}

SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
                                                 SelectionDAG &DAG) const {
  assert(Subtarget->is64Bit() && "Result not type legalized?");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue TheChain = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();
  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
                                   rax.getValue(2));
  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
                            DAG.getConstant(32, MVT::i8));
  SDValue Ops[] = {
    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
    rdx.getValue(1)
  };
  return DAG.getMergeValues(Ops, 2, dl);
}

SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
                                        SelectionDAG &DAG) const {
  EVT SrcVT = Op.getOperand(0).getValueType();
  EVT DstVT = Op.getValueType();
  assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
         Subtarget->hasMMX() && "Unexpected custom BITCAST");
  assert((DstVT == MVT::i64 ||
          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
         "Unexpected custom BITCAST");
  // i64 <=> MMX conversions are Legal.
  if (SrcVT==MVT::i64 && DstVT.isVector())
    return Op;
  if (DstVT==MVT::i64 && SrcVT.isVector())
    return Op;
  // MMX <=> MMX conversions are Legal.
  if (SrcVT.isVector() && DstVT.isVector())
    return Op;
  // All other conversions need to be expanded.
  return SDValue();
}

SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  DebugLoc dl = Node->getDebugLoc();
  EVT T = Node->getValueType(0);
  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
                              DAG.getConstant(0, T), Node->getOperand(2));
  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
                       cast<AtomicSDNode>(Node)->getMemoryVT(),
                       Node->getOperand(0),
                       Node->getOperand(1), negOp,
                       cast<AtomicSDNode>(Node)->getSrcValue(),
                       cast<AtomicSDNode>(Node)->getAlignment());
}

static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getNode()->getValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  unsigned Opc;
  bool ExtraOp = false;
  switch (Op.getOpcode()) {
  default: assert(0 && "Invalid code");
  case ISD::ADDC: Opc = X86ISD::ADD; break;
  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
  case ISD::SUBC: Opc = X86ISD::SUB; break;
  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
  }

  if (!ExtraOp)
    return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
                       Op.getOperand(1));
  return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
                     Op.getOperand(1), Op.getOperand(2));
}
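// For example, an i64 add on a 32-bit target arrives here as ADDC on the
// low halves followed by ADDE on the high halves; these map directly onto
// the EFLAGS-producing X86ISD::ADD and X86ISD::ADC nodes.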
/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Should not custom lower this!");
  case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op,DAG);
  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
  case ISD::FABS:               return LowerFABS(Op, DAG);
  case ISD::FNEG:               return LowerFNEG(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  case ISD::FRAME_TO_ARGS_OFFSET:
                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
  case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
  case ISD::SHL:                return LowerSHL(Op, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:              return LowerXALUO(Op, DAG);
  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
  }
}

void X86TargetLowering::
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
                        SelectionDAG &DAG, unsigned NewOp) const {
  EVT T = Node->getValueType(0);
void X86TargetLowering::
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
                        SelectionDAG &DAG, unsigned NewOp) const {
  EVT T = Node->getValueType(0);
  DebugLoc dl = Node->getDebugLoc();
  assert(T == MVT::i64 && "Only know how to expand i64 atomics");

  SDValue Chain = Node->getOperand(0);
  SDValue In1 = Node->getOperand(1);
  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(0));
  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(1));
  SDValue Ops[] = { Chain, In1, In2L, In2H };
  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
  SDValue Result =
    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
                            cast<MemSDNode>(Node)->getMemOperand());
  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
  Results.push_back(Result.getValue(2));
}

/// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG) const {
  DebugLoc dl = N->getDebugLoc();
  switch (N->getOpcode()) {
  default:
    assert(false && "Do not know how to custom type legalize this operation!");
    return;
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:
    // We don't want to expand or promote these.
    return;
  case ISD::FP_TO_SINT: {
    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode() != 0) {
      EVT VT = N->getValueType(0);
      // Return a load from the stack slot.
      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
                                    MachinePointerInfo(), false, false, 0));
    }
    return;
  }
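  // Note on the case below: RDTSC leaves the 64-bit counter split across
  // EDX:EAX, so the two halves are copied out of the physical registers
  // and rejoined with a BUILD_PAIR, roughly:
  //   rd  = X86ISD::RDTSC_DAG(chain)
  //   eax = CopyFromReg(rd, EAX); edx = CopyFromReg(eax, EDX)
  //   i64 result = BUILD_PAIR(eax, edx)   // edx:eax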
  case ISD::READCYCLECOUNTER: {
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue TheChain = N->getOperand(0);
    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
                                     rd.getValue(1));
    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
                                     eax.getValue(2));
    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
    SDValue Ops[] = { eax, edx };
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
    Results.push_back(edx.getValue(1));
    return;
  }
  case ISD::ATOMIC_CMP_SWAP: {
    EVT T = N->getValueType(0);
    assert(T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(0, MVT::i32));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(1, MVT::i32));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
                             cpInL.getValue(1));
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(0, MVT::i32));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(1, MVT::i32));
    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
                               cpInH.getValue(1));
    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
                               swapInL.getValue(1));
    SDValue Ops[] = { swapInH.getValue(0),
                      N->getOperand(1),
                      swapInH.getValue(1) };
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
    SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys,
                                             Ops, 3, T, MMO);
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
                                        MVT::i32, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
                                        MVT::i32, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
    Results.push_back(cpOutH.getValue(1));
    return;
  }
  case ISD::ATOMIC_LOAD_ADD:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
    return;
  case ISD::ATOMIC_LOAD_AND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_NAND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_OR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
    return;
  case ISD::ATOMIC_LOAD_SUB:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
    return;
  case ISD::ATOMIC_LOAD_XOR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
    return;
  case ISD::ATOMIC_SWAP:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
    return;
  }
}

const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return NULL;
  case X86ISD::BSF:                return "X86ISD::BSF";
  case X86ISD::BSR:                return "X86ISD::BSR";
  case X86ISD::SHLD:               return "X86ISD::SHLD";
  case X86ISD::SHRD:               return "X86ISD::SHRD";
  case X86ISD::FAND:               return "X86ISD::FAND";
  case X86ISD::FOR:                return "X86ISD::FOR";
  case X86ISD::FXOR:               return "X86ISD::FXOR";
  case X86ISD::FSRL:               return "X86ISD::FSRL";
  case X86ISD::FILD:               return "X86ISD::FILD";
  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
  case X86ISD::FLD:                return "X86ISD::FLD";
  case X86ISD::FST:                return "X86ISD::FST";
  case X86ISD::CALL:               return "X86ISD::CALL";
  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
  case X86ISD::BT:                 return "X86ISD::BT";
  case X86ISD::CMP:                return "X86ISD::CMP";
  case X86ISD::COMI:               return "X86ISD::COMI";
  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
  case X86ISD::SETCC:              return "X86ISD::SETCC";
  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
  case X86ISD::CMOV:               return "X86ISD::CMOV";
  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
  case X86ISD::PANDN:              return "X86ISD::PANDN";
  case X86ISD::PSIGNB:             return "X86ISD::PSIGNB";
  case X86ISD::PSIGNW:             return "X86ISD::PSIGNW";
  case X86ISD::PSIGND:             return "X86ISD::PSIGND";
  case X86ISD::PBLENDVB:           return "X86ISD::PBLENDVB";
  case X86ISD::FMAX:               return "X86ISD::FMAX";
  case X86ISD::FMIN:               return "X86ISD::FMIN";
  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
  case X86ISD::FRCP:               return "X86ISD::FRCP";
  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
  case X86ISD::VSHL:               return "X86ISD::VSHL";
  case X86ISD::VSRL:               return "X86ISD::VSRL";
  case X86ISD::CMPPD:              return "X86ISD::CMPPD";
  case X86ISD::CMPPS:              return "X86ISD::CMPPS";
  case X86ISD::PCMPEQB:            return "X86ISD::PCMPEQB";
  case X86ISD::PCMPEQW:            return "X86ISD::PCMPEQW";
  case X86ISD::PCMPEQD:            return "X86ISD::PCMPEQD";
  case X86ISD::PCMPEQQ:            return "X86ISD::PCMPEQQ";
  case X86ISD::PCMPGTB:            return "X86ISD::PCMPGTB";
  case X86ISD::PCMPGTW:            return "X86ISD::PCMPGTW";
  case X86ISD::PCMPGTD:            return "X86ISD::PCMPGTD";
  case X86ISD::PCMPGTQ:            return "X86ISD::PCMPGTQ";
  case X86ISD::ADD:                return "X86ISD::ADD";
  case X86ISD::SUB:                return "X86ISD::SUB";
  case X86ISD::ADC:                return "X86ISD::ADC";
  case X86ISD::SBB:                return "X86ISD::SBB";
  case X86ISD::SMUL:               return "X86ISD::SMUL";
  case X86ISD::UMUL:               return "X86ISD::UMUL";
  case X86ISD::INC:                return "X86ISD::INC";
  case X86ISD::DEC:                return "X86ISD::DEC";
  case X86ISD::OR:                 return "X86ISD::OR";
  case X86ISD::XOR:                return "X86ISD::XOR";
  case X86ISD::AND:                return "X86ISD::AND";
  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
  case X86ISD::PTEST:              return "X86ISD::PTEST";
  case X86ISD::TESTP:              return "X86ISD::TESTP";
  case X86ISD::PALIGN:             return "X86ISD::PALIGN";
  case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
  case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
  case X86ISD::PSHUFHW_LD:         return "X86ISD::PSHUFHW_LD";
  case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
  case X86ISD::PSHUFLW_LD:         return "X86ISD::PSHUFLW_LD";
  case X86ISD::SHUFPS:             return "X86ISD::SHUFPS";
  case X86ISD::SHUFPD:             return "X86ISD::SHUFPD";
  case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
  case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
  case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
  case X86ISD::MOVHLPD:            return "X86ISD::MOVHLPD";
  case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
  case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
  case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
  case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
  case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
  case X86ISD::MOVSHDUP_LD:        return "X86ISD::MOVSHDUP_LD";
  case X86ISD::MOVSLDUP_LD:        return "X86ISD::MOVSLDUP_LD";
  case X86ISD::MOVSD:              return "X86ISD::MOVSD";
  case X86ISD::MOVSS:              return "X86ISD::MOVSS";
  case X86ISD::UNPCKLPS:           return "X86ISD::UNPCKLPS";
  case X86ISD::UNPCKLPD:           return "X86ISD::UNPCKLPD";
  case X86ISD::VUNPCKLPS:          return "X86ISD::VUNPCKLPS";
  case X86ISD::VUNPCKLPD:          return "X86ISD::VUNPCKLPD";
  case X86ISD::VUNPCKLPSY:         return "X86ISD::VUNPCKLPSY";
  case X86ISD::VUNPCKLPDY:         return "X86ISD::VUNPCKLPDY";
  case X86ISD::UNPCKHPS:           return "X86ISD::UNPCKHPS";
  case X86ISD::UNPCKHPD:           return "X86ISD::UNPCKHPD";
  case X86ISD::PUNPCKLBW:          return "X86ISD::PUNPCKLBW";
  case X86ISD::PUNPCKLWD:          return "X86ISD::PUNPCKLWD";
  case X86ISD::PUNPCKLDQ:          return "X86ISD::PUNPCKLDQ";
  case X86ISD::PUNPCKLQDQ:         return "X86ISD::PUNPCKLQDQ";
  case X86ISD::PUNPCKHBW:          return "X86ISD::PUNPCKHBW";
  case X86ISD::PUNPCKHWD:          return "X86ISD::PUNPCKHWD";
  case X86ISD::PUNPCKHDQ:          return "X86ISD::PUNPCKHDQ";
  case X86ISD::PUNPCKHQDQ:         return "X86ISD::PUNPCKHQDQ";
  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
  case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
  case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
  }
}
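// For reference (informal, not exhaustive): the general addressing mode
// validated below is
//
//   BaseGV + BaseReg + Scale*IndexReg + BaseOffs
//
// e.g. [eax + 4*ecx + 12]. Scales of 3, 5 and 9 only arise from LEA forms
// that reuse the index as a base (index + 2/4/8*index), which is why they
// are rejected once a separate base register is already present.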
// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                              const Type *Ty) const {
  // X86 supports extremely general addressing modes.
  CodeModel::Model M = getTargetMachine().getCodeModel();
  Reloc::Model R = getTargetMachine().getRelocationModel();

  // X86 allows a sign-extended 32-bit immediate field as a displacement.
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
    return false;

  if (AM.BaseGV) {
    unsigned GVFlags =
      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());

    // If a reference to this global requires an extra load, we can't fold it.
    if (isGlobalStubReference(GVFlags))
      return false;

    // If BaseGV requires a register for the PIC base, we cannot also have a
    // BaseReg specified.
    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
      return false;

    // If lower 4G is not available, then we must use rip-relative addressing.
    if ((M != CodeModel::Small || R != Reloc::Static) &&
        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
      return false;
  }

  switch (AM.Scale) {
  case 0:
  case 1:
  case 2:
  case 4:
  case 8:
    // These scales always work.
    break;
  case 3:
  case 5:
  case 9:
    // These scales are formed with basereg+scalereg. Only accept if there is
    // no basereg yet.
    if (AM.HasBaseReg)
      return false;
    break;
  default: // Other stuff never works.
    return false;
  }

  return true;
}


bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  if (NumBits1 <= NumBits2)
    return false;
  return true;
}

bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  if (NumBits1 <= NumBits2)
    return false;
  return true;
}

bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
}

bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
}

bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  // i16 instructions are longer (0x66 prefix) and potentially slower.
  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                      EVT VT) const {
  // Very little shuffling can be done for 64-bit vectors right now.
  if (VT.getSizeInBits() == 64)
    return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());

  // FIXME: pshufb, blends, shifts.
  return (VT.getVectorNumElements() == 2 ||
          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
          isMOVLMask(M, VT) ||
          isSHUFPMask(M, VT) ||
          isPSHUFDMask(M, VT) ||
          isPSHUFHWMask(M, VT) ||
          isPSHUFLWMask(M, VT) ||
          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
          isUNPCKLMask(M, VT) ||
          isUNPCKHMask(M, VT) ||
          isUNPCKL_v_undef_Mask(M, VT) ||
          isUNPCKH_v_undef_Mask(M, VT));
}
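// Example of masks accepted above (illustrative): for v4i32, the mask
// <2,3,0,1> swaps the two 64-bit halves and is matched by isPSHUFDMask,
// while an interleaving mask like <0,4,1,5> is covered by isUNPCKLMask.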
bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                          EVT VT) const {
  unsigned NumElts = VT.getVectorNumElements();
  // FIXME: This collection of masks seems suspect.
  if (NumElts == 2)
    return true;
  if (NumElts == 4 && VT.getSizeInBits() == 128) {
    return (isMOVLMask(Mask, VT) ||
            isCommutedMOVLMask(Mask, VT, true) ||
            isSHUFPMask(Mask, VT) ||
            isCommutedSHUFPMask(Mask, VT));
  }
  return false;
}

//===----------------------------------------------------------------------===//
//                           X86 Scheduler Hooks
//===----------------------------------------------------------------------===//

// private utility function
MachineBasicBlock *
X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
                                                       MachineBasicBlock *MBB,
                                                       unsigned regOpc,
                                                       unsigned immOpc,
                                                       unsigned LoadOpc,
                                                       unsigned CXchgOpc,
                                                       unsigned notOpc,
                                                       unsigned EAXreg,
                                                       TargetRegisterClass *RC,
                                                       bool invSrc) const {
  // For the atomic bitwise operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld  t1 = [bitinstr.addr]
  //     op  t2 = t1, [bitinstr.val]
  //     mov EAX = t1
  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
  //     bz  newMBB
  //     fallthrough -->nextMBB
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(bInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  // Insert instructions into newMBB based on incoming instruction
  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
         "unexpected number of operands");
  DebugLoc dl = bInstr->getDebugLoc();
  MachineOperand& destOper = bInstr->getOperand(0);
  MachineOperand* argOpers[2 + X86::AddrNumOperands];
  int numArgs = bInstr->getNumOperands() - 1;
  for (int i=0; i < numArgs; ++i)
    argOpers[i] = &bInstr->getOperand(i+1);

  // An x86 address has 5 operands: base, index, scale, displacement, and
  // segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
  int valArgIndx = lastAddrIndx + 1;

  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  unsigned tt = F->getRegInfo().createVirtualRegister(RC);
  if (invSrc) {
    MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
  }
  else
    tt = t1;

  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
  MIB.addReg(tt);
  (*MIB).addOperand(*argOpers[valArgIndx]);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
  MIB.addReg(t1);

  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MIB.addReg(t2);
  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperands");
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
                    bInstr->memoperands_end());

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
  MIB.addReg(EAXreg);

  // insert branch
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);

  bInstr->eraseFromParent(); // The pseudo instruction is gone now.
  return nextMBB;
}
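// For a concrete picture of what the inserter above produces, a 32-bit
// atomic AND becomes roughly the following loop (illustrative assembly;
// the load, the operation, and the locked compare-exchange all live in
// newMBB, so a failed exchange simply retries):
//
//   newMBB:
//     mov  t1, [addr]
//     and  t2, t1, val
//     mov  eax, t1
//     lock cmpxchg [addr], t2    ; eax holds the expected value
//     jne  newMBB                ; memory changed under us, retry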
// private utility function: 64 bit atomics on 32 bit host.
MachineBasicBlock *
X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
                                                       MachineBasicBlock *MBB,
                                                       unsigned regOpcL,
                                                       unsigned regOpcH,
                                                       unsigned immOpcL,
                                                       unsigned immOpcH,
                                                       bool invSrc) const {
  // For the atomic bitwise operator, we generate
  //   thisMBB (instructions are in pairs, except cmpxchg8b)
  //     ld t1,t2 = [bitinstr.addr]
  //   newMBB:
  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
  //     op  t5, t6 <- out1, out2, [bitinstr.val]
  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
  //     mov ECX, EBX <- t5, t6
  //     mov EAX, EDX <- t1, t2
  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
  //     mov t3, t4 <- EAX, EDX
  //     bz  newMBB
  //     result in out1, out2
  //     fallthrough -->nextMBB

  const TargetRegisterClass *RC = X86::GR32RegisterClass;
  const unsigned LoadOpc = X86::MOV32rm;
  const unsigned NotOpc = X86::NOT32r;
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(bInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  DebugLoc dl = bInstr->getDebugLoc();
  // Insert instructions into newMBB based on incoming instruction
  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
         "unexpected number of operands");
  MachineOperand& dest1Oper = bInstr->getOperand(0);
  MachineOperand& dest2Oper = bInstr->getOperand(1);
  MachineOperand* argOpers[2 + X86::AddrNumOperands];
  for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
    argOpers[i] = &bInstr->getOperand(i+2);

    // We use some of the operands multiple times, so conservatively just
    // clear any kill flags that might be present.
    if (argOpers[i]->isReg() && argOpers[i]->isUse())
      argOpers[i]->setIsKill(false);
  }

  // An x86 address has 5 operands: base, index, scale, displacement, and
  // segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]

  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
  // add 4 to displacement.
  for (int i=0; i <= lastAddrIndx-2; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MachineOperand newOp3 = *(argOpers[3]);
  if (newOp3.isImm())
    newOp3.setImm(newOp3.getImm()+4);
  else
    newOp3.setOffset(newOp3.getOffset()+4);
  (*MIB).addOperand(newOp3);
  (*MIB).addOperand(*argOpers[lastAddrIndx]);

  // t3/4 are defined later, at the bottom of the loop
  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);

  // The subsequent operations should be using the destination registers of
  // the PHI instructions.
  if (invSrc) {
    t1 = F->getRegInfo().createVirtualRegister(RC);
    t2 = F->getRegInfo().createVirtualRegister(RC);
    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
  } else {
    t1 = dest1Oper.getReg();
    t2 = dest2Oper.getReg();
  }

  int valArgIndx = lastAddrIndx + 1;
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");
  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
  if (regOpcL != X86::MOV32rr)
    MIB.addReg(t1);
  (*MIB).addOperand(*argOpers[valArgIndx]);
  assert(argOpers[valArgIndx + 1]->isReg() ==
         argOpers[valArgIndx]->isReg());
  assert(argOpers[valArgIndx + 1]->isImm() ==
         argOpers[valArgIndx]->isImm());
  if (argOpers[valArgIndx + 1]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
  if (regOpcH != X86::MOV32rr)
    MIB.addReg(t2);
  (*MIB).addOperand(*argOpers[valArgIndx + 1]);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
  MIB.addReg(t1);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
  MIB.addReg(t2);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
  MIB.addReg(t5);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
  MIB.addReg(t6);

  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperands");
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
                    bInstr->memoperands_end());
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
  MIB.addReg(X86::EAX);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
  MIB.addReg(X86::EDX);

  // insert branch
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);

  bInstr->eraseFromParent(); // The pseudo instruction is gone now.
  return nextMBB;
}
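// The loop built above is the classic cmpxchg8b idiom (sketch): EDX:EAX
// holds the value last seen in memory, ECX:EBX the desired new value, and
// LOCK CMPXCHG8B either performs the 64-bit swap in one shot or reloads
// EDX:EAX so the loop can retry.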
// private utility function
MachineBasicBlock *
X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
                                                      MachineBasicBlock *MBB,
                                                      unsigned cmovOpc) const {
  // For the atomic min/max operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld t1 = [min/max.addr]
  //     mov t2 = [min/max.val]
  //     cmp  t1, t2
  //     cmov[cond] t2 = t1
  //     mov EAX = t1
  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
  //     bz   newMBB
  //     fallthrough -->nextMBB
  //
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(mInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  DebugLoc dl = mInstr->getDebugLoc();
  // Insert instructions into newMBB based on incoming instruction
  assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
         "unexpected number of operands");
  MachineOperand& destOper = mInstr->getOperand(0);
  MachineOperand* argOpers[2 + X86::AddrNumOperands];
  int numArgs = mInstr->getNumOperands() - 1;
  for (int i=0; i < numArgs; ++i)
    argOpers[i] = &mInstr->getOperand(i+1);

  // An x86 address has 5 operands: base, index, scale, displacement, and
  // segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
  int valArgIndx = lastAddrIndx + 1;

  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  // We only support register and immediate values
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");

  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
  else
    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
  (*MIB).addOperand(*argOpers[valArgIndx]);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
  MIB.addReg(t1);

  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
  MIB.addReg(t1);
  MIB.addReg(t2);

  // Generate the conditional move
  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc), t3);
  MIB.addReg(t2);
  MIB.addReg(t1);

  // Compare and exchange if no one has modified the memory location
  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MIB.addReg(t3);
  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperands");
  (*MIB).setMemRefs(mInstr->memoperands_begin(),
                    mInstr->memoperands_end());

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
  MIB.addReg(X86::EAX);

  // insert branch
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);

  mInstr->eraseFromParent(); // The pseudo instruction is gone now.
  return nextMBB;
}

// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX all of this code can be replaced with that
// in the .td file.
MachineBasicBlock *
X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
                            unsigned numArgs, bool memArg) const {
  assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) &&
         "Target must have SSE4.2 or AVX features enabled");

  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  unsigned Opc;
  if (!Subtarget->hasAVX()) {
    if (memArg)
      Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
    else
      Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
  } else {
    if (memArg)
      Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm;
    else
      Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
  }

  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
  for (unsigned i = 0; i < numArgs; ++i) {
    MachineOperand &Op = MI->getOperand(i+1);
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.addOperand(Op);
  }
  BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
    .addReg(X86::XMM0);

  MI->eraseFromParent();
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  // Address into RAX/EAX, other two args into ECX, EDX.
  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.addOperand(MI->getOperand(i));

  unsigned ValOps = X86::AddrNumOperands;
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
    .addReg(MI->getOperand(ValOps).getReg());
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
    .addReg(MI->getOperand(ValOps+1).getReg());

  // The instruction doesn't actually take any operands though.
  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));

  MI->eraseFromParent(); // The pseudo is gone now.
  return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const {
  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  // First arg in ECX, the second in EAX.
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
    .addReg(MI->getOperand(0).getReg());
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
    .addReg(MI->getOperand(1).getReg());

  // The instruction doesn't actually take any operands though.
  BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr));

  MI->eraseFromParent(); // The pseudo is gone now.
  return BB;
}
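// The va_arg inserter below implements, in MI form, roughly the following
// C logic from the SysV AMD64 ABI (informal sketch; names follow the
// va_list struct documented inside the function, gp_offset variant shown):
//
//   if (gp_offset < 6 * 8) {                 // room in reg_save_area?
//     addr = reg_save_area + gp_offset;
//     gp_offset += 8;
//   } else {
//     addr = align_if_needed(overflow_area);
//     overflow_area = addr + aligned_size;
//   }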
MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(
                   MachineInstr *MI,
                   MachineBasicBlock *MBB) const {
  // Emit va_arg instruction on X86-64.

  // Operands to this pseudo-instruction:
  // 0  ) Output        : destination address (reg)
  // 1-5) Input         : va_list address (addr, i64mem)
  // 6  ) ArgSize       : Size (in bytes) of vararg type
  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
  // 8  ) Align         : Alignment of type
  // 9  ) EFLAGS (implicit-def)

  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");

  unsigned DestReg = MI->getOperand(0).getReg();
  MachineOperand &Base = MI->getOperand(1);
  MachineOperand &Scale = MI->getOperand(2);
  MachineOperand &Index = MI->getOperand(3);
  MachineOperand &Disp = MI->getOperand(4);
  MachineOperand &Segment = MI->getOperand(5);
  unsigned ArgSize = MI->getOperand(6).getImm();
  unsigned ArgMode = MI->getOperand(7).getImm();
  unsigned Align = MI->getOperand(8).getImm();

  // Memory Reference
  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();

  // Machine Information
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
  DebugLoc DL = MI->getDebugLoc();

  // struct va_list {
  //   i32   gp_offset
  //   i32   fp_offset
  //   i64   overflow_area (address)
  //   i64   reg_save_area (address)
  // }
  // sizeof(va_list) = 24
  // alignment(va_list) = 8

  unsigned TotalNumIntRegs = 6;
  unsigned TotalNumXMMRegs = 8;
  bool UseGPOffset = (ArgMode == 1);
  bool UseFPOffset = (ArgMode == 2);
  unsigned MaxOffset = TotalNumIntRegs * 8 +
                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);

  // Align ArgSize to a multiple of 8.
  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
  bool NeedsAlign = (Align > 8);

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *overflowMBB;
  MachineBasicBlock *offsetMBB;
  MachineBasicBlock *endMBB;

  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
  unsigned OffsetReg = 0;

  if (!UseGPOffset && !UseFPOffset) {
    // If we only pull from the overflow region, we don't create a branch.
    // We don't need to alter control flow.
    OffsetDestReg = 0; // unused
    OverflowDestReg = DestReg;

    offsetMBB = NULL;
    overflowMBB = thisMBB;
    endMBB = thisMBB;
  } else {
    // First emit code to check if gp_offset (or fp_offset) is below the bound.
    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
    // If not, pull from overflow_area. (branch to overflowMBB)
    //
    //       thisMBB
    //         |     .
    //         |        .
    //     offsetMBB   overflowMBB
    //         |        .
    //         |     .
    //        endMBB

    // Registers for the PHI in endMBB
    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    MachineFunction *MF = MBB->getParent();
    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

    MachineFunction::iterator MBBIter = MBB;
    ++MBBIter;

    // Insert the new basic blocks
    MF->insert(MBBIter, offsetMBB);
    MF->insert(MBBIter, overflowMBB);
    MF->insert(MBBIter, endMBB);

    // Transfer the remainder of MBB and its successor edges to endMBB.
    endMBB->splice(endMBB->begin(), thisMBB,
                   llvm::next(MachineBasicBlock::iterator(MI)),
                   thisMBB->end());
    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

    // Make offsetMBB and overflowMBB successors of thisMBB
    thisMBB->addSuccessor(offsetMBB);
    thisMBB->addSuccessor(overflowMBB);

    // endMBB is a successor of both offsetMBB and overflowMBB
    offsetMBB->addSuccessor(endMBB);
    overflowMBB->addSuccessor(endMBB);

    // Load the offset value into a register
    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, UseFPOffset ? 4 : 0)
      .addOperand(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

    // Check if there is enough room left to pull this argument.
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
      .addReg(OffsetReg)
      .addImm(MaxOffset + 8 - ArgSizeA8);

    // Branch to "overflowMBB" if offset >= max
    // Fall through to "offsetMBB" otherwise
    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
      .addMBB(overflowMBB);
  }

  // In offsetMBB, emit code to use the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg != 0);

    // Read the reg_save_area address.
    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, 16)
      .addOperand(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

    // Zero-extend the offset
    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
      .addImm(0)
      .addReg(OffsetReg)
      .addImm(X86::sub_32bit);

    // Add the offset to the reg_save_area to get the final address.
    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
      .addReg(OffsetReg64)
      .addReg(RegSaveReg);

    // Compute the offset for the next argument
    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
      .addReg(OffsetReg)
      .addImm(UseFPOffset ? 16 : 8);

    // Store it back into the va_list.
    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, UseFPOffset ? 4 : 0)
      .addOperand(Segment)
      .addReg(NextOffsetReg)
      .setMemRefs(MMOBegin, MMOEnd);

    // Jump to endMBB
    BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
      .addMBB(endMBB);
  }

  //
  // Emit code to use overflow area
  //

  // Load the overflow_area address into a register.
  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
    .addOperand(Base)
    .addOperand(Scale)
    .addOperand(Index)
    .addDisp(Disp, 8)
    .addOperand(Segment)
    .setMemRefs(MMOBegin, MMOEnd);

  // If we need to align it, do so. Otherwise, just copy the address
  // to OverflowDestReg.
  if (NeedsAlign) {
    // Align the overflow address
    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);

    // aligned_addr = (addr + (align-1)) & ~(align-1)
    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
      .addReg(OverflowAddrReg)
      .addImm(Align-1);

    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
      .addReg(TmpReg)
      .addImm(~(uint64_t)(Align-1));
  } else {
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
      .addReg(OverflowAddrReg);
  }

  // Compute the next overflow address after this argument.
  // (the overflow address should be kept 8-byte aligned)
  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
    .addReg(OverflowDestReg)
    .addImm(ArgSizeA8);

  // Store the new overflow address.
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
    .addOperand(Base)
    .addOperand(Scale)
    .addOperand(Index)
    .addDisp(Disp, 8)
    .addOperand(Segment)
    .addReg(NextAddrReg)
    .setMemRefs(MMOBegin, MMOEnd);

  // If we branched, emit the PHI to the front of endMBB.
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), DL,
            TII->get(X86::PHI), DestReg)
      .addReg(OffsetDestReg).addMBB(offsetMBB)
      .addReg(OverflowDestReg).addMBB(overflowMBB);
  }

  // Erase the pseudo instruction
  MI->eraseFromParent();

  return endMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
                                                 MachineInstr *MI,
                                                 MachineBasicBlock *MBB) const {
  // Emit code to save XMM registers to the stack. The ABI says that the
  // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them,
  // however this code takes a simpler approach and just executes all
  // of the stores if %al is non-zero. It's less code, and it's probably
  // easier on the hardware branch predictor, and stores aren't all that
  // expensive anyway.

  // Create the new basic blocks. One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Transfer the remainder of MBB and its successor edges to EndMBB.
  EndMBB->splice(EndMBB->begin(), MBB,
                 llvm::next(MachineBasicBlock::iterator(MI)),
                 MBB->end());
  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  unsigned CountReg = MI->getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();

  if (!Subtarget->isTargetWin64()) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }

  // In the XMM save block, save all the XMM argument registers.
  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
    MachineMemOperand *MMO =
      F->getMachineMemOperand(
          MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
          MachineMemOperand::MOStore,
          /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
      .addFrameIndex(RegSaveFrameIndex)
      .addImm(/*Scale=*/1)
      .addReg(/*IndexReg=*/0)
      .addImm(/*Disp=*/Offset)
      .addReg(/*Segment=*/0)
      .addReg(MI->getOperand(i).getReg())
      .addMemOperand(MMO);
  }

  MI->eraseFromParent(); // The pseudo instruction is gone now.
  return EndMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  // To "insert" a SELECT_CC instruction, we actually have to insert the
  // diamond control-flow pattern. The incoming instruction knows the
  // destination vreg to set, the condition code register to branch on, the
  // true/false values to select between, and a branch opcode to use.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = BB;
  ++It;

  //  thisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC copy1MBB
  //   fallthrough --> copy0MBB
  MachineBasicBlock *thisMBB = BB;
  MachineFunction *F = BB->getParent();
  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(It, copy0MBB);
  F->insert(It, sinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  const MachineFunction *MF = BB->getParent();
  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
  BitVector ReservedRegs = TRI->getReservedRegs(*MF);

  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
    const MachineOperand &MO = MI->getOperand(I);
    if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue;
    unsigned Reg = MO.getReg();
    if (Reg != X86::EFLAGS) continue;
    copy0MBB->addLiveIn(Reg);
    sinkMBB->addLiveIn(Reg);
  }

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), BB,
                  llvm::next(MachineBasicBlock::iterator(MI)),
                  BB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Add the true and fallthrough blocks as its successors.
  BB->addSuccessor(copy0MBB);
  BB->addSuccessor(sinkMBB);

  // Create the conditional branch instruction.
  unsigned Opc =
    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);

  //  copy0MBB:
  //   %FalseValue = ...
  //   # fallthrough to sinkMBB
  copy0MBB->addSuccessor(sinkMBB);

  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(X86::PHI), MI->getOperand(0).getReg())
    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);

  MI->eraseFromParent(); // The pseudo instruction is gone now.
  return sinkMBB;
}
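// Note for the next function: both _chkstk (MSVC-style environments) and
// _alloca (cygwin/mingw) use a register convention rather than an ordinary
// argument list (the allocation size is passed in EAX and ESP is adjusted
// by the callee), which is why the call is modeled below with implicit
// uses and defs of EAX/ESP only.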
"_chkstk" : "_alloca"; 10409 10410 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 10411 .addExternalSymbol(StackProbeSymbol) 10412 .addReg(X86::EAX, RegState::Implicit) 10413 .addReg(X86::ESP, RegState::Implicit) 10414 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 10415 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 10416 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 10417 10418 MI->eraseFromParent(); // The pseudo instruction is gone now. 10419 return BB; 10420} 10421 10422MachineBasicBlock * 10423X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 10424 MachineBasicBlock *BB) const { 10425 // This is pretty easy. We're taking the value that we received from 10426 // our load from the relocation, sticking it in either RDI (x86-64) 10427 // or EAX and doing an indirect call. The return value will then 10428 // be in the normal return register. 10429 const X86InstrInfo *TII 10430 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 10431 DebugLoc DL = MI->getDebugLoc(); 10432 MachineFunction *F = BB->getParent(); 10433 10434 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 10435 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 10436 10437 if (Subtarget->is64Bit()) { 10438 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 10439 TII->get(X86::MOV64rm), X86::RDI) 10440 .addReg(X86::RIP) 10441 .addImm(0).addReg(0) 10442 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 10443 MI->getOperand(3).getTargetFlags()) 10444 .addReg(0); 10445 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 10446 addDirectMem(MIB, X86::RDI); 10447 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 10448 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 10449 TII->get(X86::MOV32rm), X86::EAX) 10450 .addReg(0) 10451 .addImm(0).addReg(0) 10452 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 10453 MI->getOperand(3).getTargetFlags()) 10454 .addReg(0); 10455 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 10456 addDirectMem(MIB, X86::EAX); 10457 } else { 10458 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 10459 TII->get(X86::MOV32rm), X86::EAX) 10460 .addReg(TII->getGlobalBaseReg(F)) 10461 .addImm(0).addReg(0) 10462 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 10463 MI->getOperand(3).getTargetFlags()) 10464 .addReg(0); 10465 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 10466 addDirectMem(MIB, X86::EAX); 10467 } 10468 10469 MI->eraseFromParent(); // The pseudo instruction is gone now. 10470 return BB; 10471} 10472 10473MachineBasicBlock * 10474X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 10475 MachineBasicBlock *BB) const { 10476 switch (MI->getOpcode()) { 10477 default: assert(false && "Unexpected instr type to insert"); 10478 case X86::TAILJMPd64: 10479 case X86::TAILJMPr64: 10480 case X86::TAILJMPm64: 10481 assert(!"TAILJMP64 would not be touched here."); 10482 case X86::TCRETURNdi64: 10483 case X86::TCRETURNri64: 10484 case X86::TCRETURNmi64: 10485 // Defs of TCRETURNxx64 has Win64's callee-saved registers, as subset. 10486 // On AMD64, additional defs should be added before register allocation. 
    if (!Subtarget->isTargetWin64()) {
      MI->addRegisterDefined(X86::RSI);
      MI->addRegisterDefined(X86::RDI);
      MI->addRegisterDefined(X86::XMM6);
      MI->addRegisterDefined(X86::XMM7);
      MI->addRegisterDefined(X86::XMM8);
      MI->addRegisterDefined(X86::XMM9);
      MI->addRegisterDefined(X86::XMM10);
      MI->addRegisterDefined(X86::XMM11);
      MI->addRegisterDefined(X86::XMM12);
      MI->addRegisterDefined(X86::XMM13);
      MI->addRegisterDefined(X86::XMM14);
      MI->addRegisterDefined(X86::XMM15);
    }
    return BB;
  case X86::WIN_ALLOCA:
    return EmitLoweredWinAlloca(MI, BB);
  case X86::TLSCall_32:
  case X86::TLSCall_64:
    return EmitLoweredTLSCall(MI, BB);
  case X86::CMOV_GR8:
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_V4F32:
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64:
  case X86::CMOV_GR16:
  case X86::CMOV_GR32:
  case X86::CMOV_RFP32:
  case X86::CMOV_RFP64:
  case X86::CMOV_RFP80:
    return EmitLoweredSelect(MI, BB);
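  // Background for the block below (informal x87 reminder): C semantics
  // require float->int conversions to truncate, but the FPU rounds by
  // default, so the sequence is: save the FP control word, store 0xC7F
  // (which sets the rounding-control bits 10-11 to "round toward zero"
  // with the usual exception masks), convert, then restore the old word.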
  case X86::FP32_TO_INT16_IN_MEM:
  case X86::FP32_TO_INT32_IN_MEM:
  case X86::FP32_TO_INT64_IN_MEM:
  case X86::FP64_TO_INT16_IN_MEM:
  case X86::FP64_TO_INT32_IN_MEM:
  case X86::FP64_TO_INT64_IN_MEM:
  case X86::FP80_TO_INT16_IN_MEM:
  case X86::FP80_TO_INT32_IN_MEM:
  case X86::FP80_TO_INT64_IN_MEM: {
    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();

    // Change the floating point control register to use "round towards zero"
    // mode when truncating to an integer value.
    MachineFunction *F = BB->getParent();
    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FNSTCW16m)), CWFrameIdx);

    // Load the old value of the high byte of the control word...
    unsigned OldCW =
      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
                      CWFrameIdx);

    // Set the high part to be round to zero...
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
      .addImm(0xC7F);

    // Reload the modified control word now...
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FLDCW16m)), CWFrameIdx);

    // Restore the memory image of control word to original value
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
      .addReg(OldCW);

    // Get the X86 opcode to use.
    unsigned Opc;
    switch (MI->getOpcode()) {
    default: llvm_unreachable("illegal opcode!");
    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
    }

    X86AddressMode AM;
    MachineOperand &Op = MI->getOperand(0);
    if (Op.isReg()) {
      AM.BaseType = X86AddressMode::RegBase;
      AM.Base.Reg = Op.getReg();
    } else {
      AM.BaseType = X86AddressMode::FrameIndexBase;
      AM.Base.FrameIndex = Op.getIndex();
    }
    Op = MI->getOperand(1);
    if (Op.isImm())
      AM.Scale = Op.getImm();
    Op = MI->getOperand(2);
    if (Op.isImm())
      AM.IndexReg = Op.getImm();
    Op = MI->getOperand(3);
    if (Op.isGlobal()) {
      AM.GV = Op.getGlobal();
    } else {
      AM.Disp = Op.getImm();
    }
    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
      .addReg(MI->getOperand(X86::AddrNumOperands).getReg());

    // Reload the original control word now.
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FLDCW16m)), CWFrameIdx);

    MI->eraseFromParent(); // The pseudo instruction is gone now.
    return BB;
  }
  // String/text processing lowering.
  case X86::PCMPISTRM128REG:
  case X86::VPCMPISTRM128REG:
    return EmitPCMP(MI, BB, 3, false /* in-mem */);
  case X86::PCMPISTRM128MEM:
  case X86::VPCMPISTRM128MEM:
    return EmitPCMP(MI, BB, 3, true /* in-mem */);
  case X86::PCMPESTRM128REG:
  case X86::VPCMPESTRM128REG:
    return EmitPCMP(MI, BB, 5, false /* in-mem */);
  case X86::PCMPESTRM128MEM:
  case X86::VPCMPESTRM128MEM:
    return EmitPCMP(MI, BB, 5, true /* in-mem */);

  // Thread synchronization.
  case X86::MONITOR:
    return EmitMonitor(MI, BB);
  case X86::MWAIT:
    return EmitMwait(MI, BB);

  // Atomic Lowering.
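  // A note on the ATOMNAND* cases below: x86 has no nand instruction, so
  // the bitwise inserter is reused with invSrc=true, which applies a NOT
  // to the loaded value before the AND inside the retry loop.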
10624 case X86::ATOMAND32: 10625 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 10626 X86::AND32ri, X86::MOV32rm, 10627 X86::LCMPXCHG32, 10628 X86::NOT32r, X86::EAX, 10629 X86::GR32RegisterClass); 10630 case X86::ATOMOR32: 10631 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 10632 X86::OR32ri, X86::MOV32rm, 10633 X86::LCMPXCHG32, 10634 X86::NOT32r, X86::EAX, 10635 X86::GR32RegisterClass); 10636 case X86::ATOMXOR32: 10637 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 10638 X86::XOR32ri, X86::MOV32rm, 10639 X86::LCMPXCHG32, 10640 X86::NOT32r, X86::EAX, 10641 X86::GR32RegisterClass); 10642 case X86::ATOMNAND32: 10643 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 10644 X86::AND32ri, X86::MOV32rm, 10645 X86::LCMPXCHG32, 10646 X86::NOT32r, X86::EAX, 10647 X86::GR32RegisterClass, true); 10648 case X86::ATOMMIN32: 10649 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 10650 case X86::ATOMMAX32: 10651 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 10652 case X86::ATOMUMIN32: 10653 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 10654 case X86::ATOMUMAX32: 10655 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 10656 10657 case X86::ATOMAND16: 10658 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 10659 X86::AND16ri, X86::MOV16rm, 10660 X86::LCMPXCHG16, 10661 X86::NOT16r, X86::AX, 10662 X86::GR16RegisterClass); 10663 case X86::ATOMOR16: 10664 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 10665 X86::OR16ri, X86::MOV16rm, 10666 X86::LCMPXCHG16, 10667 X86::NOT16r, X86::AX, 10668 X86::GR16RegisterClass); 10669 case X86::ATOMXOR16: 10670 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 10671 X86::XOR16ri, X86::MOV16rm, 10672 X86::LCMPXCHG16, 10673 X86::NOT16r, X86::AX, 10674 X86::GR16RegisterClass); 10675 case X86::ATOMNAND16: 10676 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 10677 X86::AND16ri, X86::MOV16rm, 10678 X86::LCMPXCHG16, 10679 X86::NOT16r, X86::AX, 10680 X86::GR16RegisterClass, true); 10681 case X86::ATOMMIN16: 10682 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 10683 case X86::ATOMMAX16: 10684 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 10685 case X86::ATOMUMIN16: 10686 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 10687 case X86::ATOMUMAX16: 10688 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 10689 10690 case X86::ATOMAND8: 10691 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 10692 X86::AND8ri, X86::MOV8rm, 10693 X86::LCMPXCHG8, 10694 X86::NOT8r, X86::AL, 10695 X86::GR8RegisterClass); 10696 case X86::ATOMOR8: 10697 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 10698 X86::OR8ri, X86::MOV8rm, 10699 X86::LCMPXCHG8, 10700 X86::NOT8r, X86::AL, 10701 X86::GR8RegisterClass); 10702 case X86::ATOMXOR8: 10703 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 10704 X86::XOR8ri, X86::MOV8rm, 10705 X86::LCMPXCHG8, 10706 X86::NOT8r, X86::AL, 10707 X86::GR8RegisterClass); 10708 case X86::ATOMNAND8: 10709 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 10710 X86::AND8ri, X86::MOV8rm, 10711 X86::LCMPXCHG8, 10712 X86::NOT8r, X86::AL, 10713 X86::GR8RegisterClass, true); 10714 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 10715 // This group is for 64-bit host. 
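
// Illustrative sketch (not part of the lowering code; std::atomic stands in
// for the generated machine code): each ATOM* pseudo in this switch expands
// to a load / operate / LCMPXCHG retry loop.  For the 'invert' (NAND) flavor:
#if 0
#include <atomic>
static int atomic_nand_like(std::atomic<int> &Mem, int Val) {
  int Old = Mem.load();                      // MOV32rm: load current value
  for (;;) {
    int New = ~(Old & Val);                  // AND32rr, then NOT32r (invert)
    if (Mem.compare_exchange_weak(Old, New)) // LCMPXCHG32 with EAX = Old
      return Old;                            // RMW result is the old value
    // On failure, Old was refreshed from memory; retry.
  }
}
#endif
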
10716 case X86::ATOMAND64: 10717 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 10718 X86::AND64ri32, X86::MOV64rm, 10719 X86::LCMPXCHG64, 10720 X86::NOT64r, X86::RAX, 10721 X86::GR64RegisterClass); 10722 case X86::ATOMOR64: 10723 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 10724 X86::OR64ri32, X86::MOV64rm, 10725 X86::LCMPXCHG64, 10726 X86::NOT64r, X86::RAX, 10727 X86::GR64RegisterClass); 10728 case X86::ATOMXOR64: 10729 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 10730 X86::XOR64ri32, X86::MOV64rm, 10731 X86::LCMPXCHG64, 10732 X86::NOT64r, X86::RAX, 10733 X86::GR64RegisterClass); 10734 case X86::ATOMNAND64: 10735 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 10736 X86::AND64ri32, X86::MOV64rm, 10737 X86::LCMPXCHG64, 10738 X86::NOT64r, X86::RAX, 10739 X86::GR64RegisterClass, true); 10740 case X86::ATOMMIN64: 10741 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 10742 case X86::ATOMMAX64: 10743 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 10744 case X86::ATOMUMIN64: 10745 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 10746 case X86::ATOMUMAX64: 10747 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 10748 10749 // This group does 64-bit operations on a 32-bit host. 10750 case X86::ATOMAND6432: 10751 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10752 X86::AND32rr, X86::AND32rr, 10753 X86::AND32ri, X86::AND32ri, 10754 false); 10755 case X86::ATOMOR6432: 10756 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10757 X86::OR32rr, X86::OR32rr, 10758 X86::OR32ri, X86::OR32ri, 10759 false); 10760 case X86::ATOMXOR6432: 10761 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10762 X86::XOR32rr, X86::XOR32rr, 10763 X86::XOR32ri, X86::XOR32ri, 10764 false); 10765 case X86::ATOMNAND6432: 10766 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10767 X86::AND32rr, X86::AND32rr, 10768 X86::AND32ri, X86::AND32ri, 10769 true); 10770 case X86::ATOMADD6432: 10771 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10772 X86::ADD32rr, X86::ADC32rr, 10773 X86::ADD32ri, X86::ADC32ri, 10774 false); 10775 case X86::ATOMSUB6432: 10776 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10777 X86::SUB32rr, X86::SBB32rr, 10778 X86::SUB32ri, X86::SBB32ri, 10779 false); 10780 case X86::ATOMSWAP6432: 10781 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10782 X86::MOV32rr, X86::MOV32rr, 10783 X86::MOV32ri, X86::MOV32ri, 10784 false); 10785 case X86::VASTART_SAVE_XMM_REGS: 10786 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 10787 10788 case X86::VAARG_64: 10789 return EmitVAARG64WithCustomInserter(MI, BB); 10790 } 10791} 10792 10793//===----------------------------------------------------------------------===// 10794// X86 Optimization Hooks 10795//===----------------------------------------------------------------------===// 10796 10797void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 10798 const APInt &Mask, 10799 APInt &KnownZero, 10800 APInt &KnownOne, 10801 const SelectionDAG &DAG, 10802 unsigned Depth) const { 10803 unsigned Opc = Op.getOpcode(); 10804 assert((Opc >= ISD::BUILTIN_OP_END || 10805 Opc == ISD::INTRINSIC_WO_CHAIN || 10806 Opc == ISD::INTRINSIC_W_CHAIN || 10807 Opc == ISD::INTRINSIC_VOID) && 10808 "Should use MaskedValueIsZero if you don't know whether Op" 10809 " is a target node!"); 10810 10811 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
10812 switch (Opc) { 10813 default: break; 10814 case X86ISD::ADD: 10815 case X86ISD::SUB: 10816 case X86ISD::ADC: 10817 case X86ISD::SBB: 10818 case X86ISD::SMUL: 10819 case X86ISD::UMUL: 10820 case X86ISD::INC: 10821 case X86ISD::DEC: 10822 case X86ISD::OR: 10823 case X86ISD::XOR: 10824 case X86ISD::AND: 10825 // These nodes' second result is a boolean. 10826 if (Op.getResNo() == 0) 10827 break; 10828 // Fallthrough 10829 case X86ISD::SETCC: 10830 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 10831 Mask.getBitWidth() - 1); 10832 break; 10833 } 10834} 10835 10836unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 10837 unsigned Depth) const { 10838 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 10839 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 10840 return Op.getValueType().getScalarType().getSizeInBits(); 10841 10842 // Fallback case. 10843 return 1; 10844} 10845 10846/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 10847/// node is a GlobalAddress + offset. 10848bool X86TargetLowering::isGAPlusOffset(SDNode *N, 10849 const GlobalValue* &GA, 10850 int64_t &Offset) const { 10851 if (N->getOpcode() == X86ISD::Wrapper) { 10852 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 10853 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 10854 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 10855 return true; 10856 } 10857 } 10858 return TargetLowering::isGAPlusOffset(N, GA, Offset); 10859} 10860 10861/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 10862/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 10863/// if the load addresses are consecutive, non-overlapping, and in the right 10864/// order. 10865static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 10866 TargetLowering::DAGCombinerInfo &DCI) { 10867 DebugLoc dl = N->getDebugLoc(); 10868 EVT VT = N->getValueType(0); 10869 10870 if (VT.getSizeInBits() != 128) 10871 return SDValue(); 10872 10873 // Don't create instructions with illegal types after legalize types has run. 10874 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10875 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 10876 return SDValue(); 10877 10878 SmallVector<SDValue, 16> Elts; 10879 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 10880 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); 10881 10882 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 10883} 10884 10885/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 10886/// generation and convert it from being a bunch of shuffles and extracts 10887/// to a simple store and scalar loads to extract the elements. 10888static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 10889 const TargetLowering &TLI) { 10890 SDValue InputVector = N->getOperand(0); 10891 10892 // Only operate on vectors of 4 elements, where the alternative shuffling 10893 // gets to be more expensive. 10894 if (InputVector.getValueType() != MVT::v4i32) 10895 return SDValue(); 10896 10897 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 10898 // single use which is a sign-extend or zero-extend, and all elements are 10899 // used. 
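
// Illustrative sketch (not part of the lowering code; names are invented):
// the net effect of this combine in scalar terms.  The scan below collects
// the extracts; the rewrite then spills the vector once to a stack
// temporary and issues a cheap scalar load per lane, instead of a
// shuffle/extract chain per lane.
#if 0
#include <cstdint>
#include <cstring>
struct V4I32 { int32_t Lane[4]; };
static void extractAllLanes(const V4I32 &Vec, int64_t Out[4]) {
  int32_t Slot[4];
  std::memcpy(Slot, &Vec, sizeof(Slot)); // single store to a stack slot
  for (int I = 0; I != 4; ++I)
    Out[I] = Slot[I];                    // scalar load (+ extension) per use
}
#endif
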
10900   SmallVector<SDNode *, 4> Uses;
10901   unsigned ExtractedElements = 0;
10902   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
10903        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
10904     if (UI.getUse().getResNo() != InputVector.getResNo())
10905       return SDValue();
10906
10907     SDNode *Extract = *UI;
10908     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10909       return SDValue();
10910
10911     if (Extract->getValueType(0) != MVT::i32)
10912       return SDValue();
10913     if (!Extract->hasOneUse())
10914       return SDValue();
10915     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
10916         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
10917       return SDValue();
10918     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
10919       return SDValue();
10920
10921     // Record which element was extracted.
10922     ExtractedElements |=
10923       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
10924
10925     Uses.push_back(Extract);
10926   }
10927
10928   // If not all the elements were used, this may not be worthwhile.
10929   if (ExtractedElements != 15)
10930     return SDValue();
10931
10932   // Ok, we've now decided to do the transformation.
10933   DebugLoc dl = InputVector.getDebugLoc();
10934
10935   // Store the value to a temporary stack slot.
10936   SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
10937   SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
10938                             MachinePointerInfo(), false, false, 0);
10939
10940   // Replace each use (extract) with a load of the appropriate element.
10941   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
10942        UE = Uses.end(); UI != UE; ++UI) {
10943     SDNode *Extract = *UI;
10944
10945     // Compute the element's address.
10946     SDValue Idx = Extract->getOperand(1);
10947     unsigned EltSize =
10948       InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
10949     uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
10950     SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
10951
10952     SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(),
10953                                      StackPtr, OffsetVal);
10954
10955     // Load the scalar.
10956     SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
10957                                      ScalarAddr, MachinePointerInfo(),
10958                                      false, false, 0);
10959
10960     // Replace the extract with the load.
10961     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
10962   }
10963
10964   // The replacement was made in place; don't return anything.
10965   return SDValue();
10966 }
10967
10968 /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
10969 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
10970                                     const X86Subtarget *Subtarget) {
10971   DebugLoc DL = N->getDebugLoc();
10972   SDValue Cond = N->getOperand(0);
10973   // Get the LHS/RHS of the select.
10974   SDValue LHS = N->getOperand(1);
10975   SDValue RHS = N->getOperand(2);
10976
10977   // If we have SSE[12] support, try to form min/max nodes.  SSE min/max
10978   // instructions match the semantics of the common C idiom x<y?x:y but not
10979   // x<=y?x:y, because of how they handle negative zero (which can be
10980   // ignored in unsafe-math mode).
10981   if (Subtarget->hasSSE2() &&
10982       (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
10983       Cond.getOpcode() == ISD::SETCC) {
10984     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
10985
10986     unsigned Opcode = 0;
10987     // Check for x CC y ? x : y.
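
// Illustrative sketch (not part of the lowering code; the helper models
// MINSS as "SRC1 < SRC2 ? SRC1 : SRC2", returning SRC2 on equal or
// unordered): why the signed-zero and NaN guards in the switch below exist.
#if 0
static float minss_model(float A, float B) { return A < B ? A : B; }
// minss_model(-0.0f, +0.0f) == +0.0f: the compare is false, so B is taken.
// That matches the C idiom x < y ? x : y exactly, but a strict fmin would
// have to return -0.0f, which is why SETOLE and friends are guarded below.
#endif
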
10988   if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
10989       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
10990     switch (CC) {
10991     default: break;
10992     case ISD::SETULT:
10993       // Converting this to a min would handle NaNs incorrectly, and swapping
10994       // the operands would cause it to handle comparisons between positive
10995       // and negative zero incorrectly.
10996       if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
10997         if (!UnsafeFPMath &&
10998             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
10999           break;
11000         std::swap(LHS, RHS);
11001       }
11002       Opcode = X86ISD::FMIN;
11003       break;
11004     case ISD::SETOLE:
11005       // Converting this to a min would handle comparisons between positive
11006       // and negative zero incorrectly.
11007       if (!UnsafeFPMath &&
11008           !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
11009         break;
11010       Opcode = X86ISD::FMIN;
11011       break;
11012     case ISD::SETULE:
11013       // Converting this to a min would handle both negative zeros and NaNs
11014       // incorrectly, but we can swap the operands to fix both.
11015       std::swap(LHS, RHS); // Fallthrough
11016     case ISD::SETOLT:
11017     case ISD::SETLT:
11018     case ISD::SETLE:
11019       Opcode = X86ISD::FMIN;
11020       break;
11021
11022     case ISD::SETOGE:
11023       // Converting this to a max would handle comparisons between positive
11024       // and negative zero incorrectly.
11025       if (!UnsafeFPMath &&
11026           !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
11027         break;
11028       Opcode = X86ISD::FMAX;
11029       break;
11030     case ISD::SETUGT:
11031       // Converting this to a max would handle NaNs incorrectly, and swapping
11032       // the operands would cause it to handle comparisons between positive
11033       // and negative zero incorrectly.
11034       if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
11035         if (!UnsafeFPMath &&
11036             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
11037           break;
11038         std::swap(LHS, RHS);
11039       }
11040       Opcode = X86ISD::FMAX;
11041       break;
11042     case ISD::SETUGE:
11043       // Converting this to a max would handle both negative zeros and NaNs
11044       // incorrectly, but we can swap the operands to fix both.
11045       std::swap(LHS, RHS); // Fallthrough
11046     case ISD::SETOGT:
11047     case ISD::SETGT:
11048     case ISD::SETGE:
11049       Opcode = X86ISD::FMAX;
11050       break;
11051     }
11052   // Check for x CC y ? y : x -- a min/max with reversed arms.
11053   } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
11054              DAG.isEqualTo(RHS, Cond.getOperand(0))) {
11055     switch (CC) {
11056     default: break;
11057     case ISD::SETOGE:
11058       // Converting this to a min would handle comparisons between positive
11059       // and negative zero incorrectly, and swapping the operands would
11060       // cause it to handle NaNs incorrectly.
11061       if (!UnsafeFPMath &&
11062           !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
11063         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
11064           break;
11065         std::swap(LHS, RHS);
11066       }
11067       Opcode = X86ISD::FMIN;
11068       break;
11069     case ISD::SETUGT:
11070       // Converting this to a min would handle NaNs incorrectly.
11071       if (!UnsafeFPMath &&
11072           (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
11073         break;
11074       Opcode = X86ISD::FMIN;
11075       break;
11076     case ISD::SETUGE:
11077       // Converting this to a min would handle both negative zeros and NaNs
11078       // incorrectly, but we can swap the operands to fix both.
11079       std::swap(LHS, RHS); // Fallthrough
11080     case ISD::SETOGT:
11081     case ISD::SETGT:
11082     case ISD::SETGE:
11083       Opcode = X86ISD::FMIN;
11084       break;
11085
11086     case ISD::SETULT:
11087       // Converting this to a max would handle NaNs incorrectly.
11088       if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
11089         break;
11090       Opcode = X86ISD::FMAX;
11091       break;
11092     case ISD::SETOLE:
11093       // Converting this to a max would handle comparisons between positive
11094       // and negative zero incorrectly, and swapping the operands would
11095       // cause it to handle NaNs incorrectly.
11096       if (!UnsafeFPMath &&
11097           !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
11098         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
11099           break;
11100         std::swap(LHS, RHS);
11101       }
11102       Opcode = X86ISD::FMAX;
11103       break;
11104     case ISD::SETULE:
11105       // Converting this to a max would handle both negative zeros and NaNs
11106       // incorrectly, but we can swap the operands to fix both.
11107       std::swap(LHS, RHS); // Fallthrough
11108     case ISD::SETOLT:
11109     case ISD::SETLT:
11110     case ISD::SETLE:
11111       Opcode = X86ISD::FMAX;
11112       break;
11113     }
11114   }
11115
11116   if (Opcode)
11117     return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
11118   }
11119
11120   // If this is a select between two integer constants, try to do some
11121   // optimizations.
11122   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
11123     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
11124       // Don't do this for crazy integer types.
11125       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
11126         // If this is efficiently invertible, canonicalize the TrueC/FalseC
11127         // values so that TrueC (the true value) is larger than FalseC.
11128         bool NeedsCondInvert = false;
11129
11130         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
11131             // Efficiently invertible.
11132             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
11133              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
11134               isa<ConstantSDNode>(Cond.getOperand(1))))) {
11135           NeedsCondInvert = true;
11136           std::swap(TrueC, FalseC);
11137         }
11138
11139         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
11140         if (FalseC->getAPIntValue() == 0 &&
11141             TrueC->getAPIntValue().isPowerOf2()) {
11142           if (NeedsCondInvert) // Invert the condition if needed.
11143             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
11144                                DAG.getConstant(1, Cond.getValueType()));
11145
11146           // Zero extend the condition if needed.
11147           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
11148
11149           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
11150           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
11151                              DAG.getConstant(ShAmt, MVT::i8));
11152         }
11153
11154         // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
11155         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
11156           if (NeedsCondInvert) // Invert the condition if needed.
11157             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
11158                                DAG.getConstant(1, Cond.getValueType()));
11159
11160           // Zero extend the condition if needed.
11161           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
11162                              FalseC->getValueType(0), Cond);
11163           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
11164                              SDValue(FalseC, 0));
11165         }
11166
11167         // Optimize cases that will turn into an LEA instruction.  This requires
11168         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
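
// Illustrative sketch (not part of the lowering code; helper names are
// invented): scalar forms of the branch-free selects produced here,
// assuming the condition has already been materialized as a 0/1 value C.
#if 0
static unsigned selPow2(bool C)               { return unsigned(C) << 3; } // C ? 8 : 0
static unsigned selAdjacent(bool C, unsigned Cst) { return Cst + C; }      // C ? Cst+1 : Cst
static unsigned selLea(bool C, unsigned Base) { return Base + C * 5; }     // Diff of 5 -> one LEA
#endif
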
11169 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 11170 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 11171 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 11172 11173 bool isFastMultiplier = false; 11174 if (Diff < 10) { 11175 switch ((unsigned char)Diff) { 11176 default: break; 11177 case 1: // result = add base, cond 11178 case 2: // result = lea base( , cond*2) 11179 case 3: // result = lea base(cond, cond*2) 11180 case 4: // result = lea base( , cond*4) 11181 case 5: // result = lea base(cond, cond*4) 11182 case 8: // result = lea base( , cond*8) 11183 case 9: // result = lea base(cond, cond*8) 11184 isFastMultiplier = true; 11185 break; 11186 } 11187 } 11188 11189 if (isFastMultiplier) { 11190 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 11191 if (NeedsCondInvert) // Invert the condition if needed. 11192 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 11193 DAG.getConstant(1, Cond.getValueType())); 11194 11195 // Zero extend the condition if needed. 11196 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 11197 Cond); 11198 // Scale the condition by the difference. 11199 if (Diff != 1) 11200 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 11201 DAG.getConstant(Diff, Cond.getValueType())); 11202 11203 // Add the base if non-zero. 11204 if (FalseC->getAPIntValue() != 0) 11205 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 11206 SDValue(FalseC, 0)); 11207 return Cond; 11208 } 11209 } 11210 } 11211 } 11212 11213 return SDValue(); 11214} 11215 11216/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 11217static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 11218 TargetLowering::DAGCombinerInfo &DCI) { 11219 DebugLoc DL = N->getDebugLoc(); 11220 11221 // If the flag operand isn't dead, don't touch this CMOV. 11222 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 11223 return SDValue(); 11224 11225 // If this is a select between two integer constants, try to do some 11226 // optimizations. Note that the operands are ordered the opposite of SELECT 11227 // operands. 11228 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 11229 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 11230 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 11231 // larger than FalseC (the false value). 11232 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 11233 11234 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 11235 CC = X86::GetOppositeBranchCondition(CC); 11236 std::swap(TrueC, FalseC); 11237 } 11238 11239 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 11240 // This is efficient for any integer data type (including i8/i16) and 11241 // shift amount. 11242 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 11243 SDValue Cond = N->getOperand(3); 11244 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 11245 DAG.getConstant(CC, MVT::i8), Cond); 11246 11247 // Zero extend the condition if needed. 11248 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 11249 11250 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 11251 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 11252 DAG.getConstant(ShAmt, MVT::i8)); 11253 if (N->getNumValues() == 2) // Dead flag value? 11254 return DCI.CombineTo(N, Cond, SDValue()); 11255 return Cond; 11256 } 11257 11258 // Optimize Cond ? 
cst+1 : cst -> zext(setcc(C)) + cst.  This is efficient
11259       // for any integer data type, including i8/i16.
11260       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
11261         SDValue Cond = N->getOperand(3);
11262         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
11263                            DAG.getConstant(CC, MVT::i8), Cond);
11264
11265         // Zero extend the condition if needed.
11266         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
11267                            FalseC->getValueType(0), Cond);
11268         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
11269                            SDValue(FalseC, 0));
11270
11271         if (N->getNumValues() == 2) // Dead flag value?
11272           return DCI.CombineTo(N, Cond, SDValue());
11273         return Cond;
11274       }
11275
11276       // Optimize cases that will turn into an LEA instruction.  This requires
11277       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
11278       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
11279         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
11280         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
11281
11282         bool isFastMultiplier = false;
11283         if (Diff < 10) {
11284           switch ((unsigned char)Diff) {
11285           default: break;
11286           case 1:  // result = add base, cond
11287           case 2:  // result = lea base(    , cond*2)
11288           case 3:  // result = lea base(cond, cond*2)
11289           case 4:  // result = lea base(    , cond*4)
11290           case 5:  // result = lea base(cond, cond*4)
11291           case 8:  // result = lea base(    , cond*8)
11292           case 9:  // result = lea base(cond, cond*8)
11293             isFastMultiplier = true;
11294             break;
11295           }
11296         }
11297
11298         if (isFastMultiplier) {
11299           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
11300           SDValue Cond = N->getOperand(3);
11301           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
11302                              DAG.getConstant(CC, MVT::i8), Cond);
11303           // Zero extend the condition if needed.
11304           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
11305                              Cond);
11306           // Scale the condition by the difference.
11307           if (Diff != 1)
11308             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
11309                                DAG.getConstant(Diff, Cond.getValueType()));
11310
11311           // Add the base if non-zero.
11312           if (FalseC->getAPIntValue() != 0)
11313             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
11314                                SDValue(FalseC, 0));
11315           if (N->getNumValues() == 2) // Dead flag value?
11316             return DCI.CombineTo(N, Cond, SDValue());
11317           return Cond;
11318         }
11319       }
11320     }
11321   }
11322   return SDValue();
11323 }
11324
11325
11326 /// PerformMulCombine - Optimize a single multiply with constant into two
11327 /// in order to implement it with two cheaper instructions, e.g.
11328 /// LEA + SHL, LEA + LEA.
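
// Illustrative sketch (not part of the lowering code; helper name is
// invented): the decomposition below for, e.g., MulAmt = 45 = 9 * 5, where
// each factor is encodable as LEA scaling.
#if 0
static unsigned long long mul45(unsigned long long X) {
  unsigned long long T = X + X * 8; // leaq (%rdi,%rdi,8), %rax   -- times 9
  return T + T * 4;                 // leaq (%rax,%rax,4), %rax   -- times 5
}
#endif
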
11329 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
11330                                  TargetLowering::DAGCombinerInfo &DCI) {
11331   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
11332     return SDValue();
11333
11334   EVT VT = N->getValueType(0);
11335   if (VT != MVT::i64)
11336     return SDValue();
11337
11338   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
11339   if (!C)
11340     return SDValue();
11341   uint64_t MulAmt = C->getZExtValue();
11342   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
11343     return SDValue();
11344
11345   uint64_t MulAmt1 = 0;
11346   uint64_t MulAmt2 = 0;
11347   if ((MulAmt % 9) == 0) {
11348     MulAmt1 = 9;
11349     MulAmt2 = MulAmt / 9;
11350   } else if ((MulAmt % 5) == 0) {
11351     MulAmt1 = 5;
11352     MulAmt2 = MulAmt / 5;
11353   } else if ((MulAmt % 3) == 0) {
11354     MulAmt1 = 3;
11355     MulAmt2 = MulAmt / 3;
11356   }
11357   if (MulAmt2 &&
11358       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)) {
11359     DebugLoc DL = N->getDebugLoc();
11360
11361     if (isPowerOf2_64(MulAmt2) &&
11362         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
11363       // If the second multiplier is pow2, issue it first.  We want the
11364       // multiply by 3, 5, or 9 to be folded into the addressing mode unless
11365       // the lone use is an add.
11366       std::swap(MulAmt1, MulAmt2);
11367
11368     SDValue NewMul;
11369     if (isPowerOf2_64(MulAmt1))
11370       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
11371                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
11372     else
11373       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
11374                            DAG.getConstant(MulAmt1, VT));
11375
11376     if (isPowerOf2_64(MulAmt2))
11377       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
11378                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
11379     else
11380       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
11381                            DAG.getConstant(MulAmt2, VT));
11382
11383     // Do not add new nodes to DAG combiner worklist.
11384     DCI.CombineTo(N, NewMul, false);
11385   }
11386   return SDValue();
11387 }
11388
11389 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
11390   SDValue N0 = N->getOperand(0);
11391   SDValue N1 = N->getOperand(1);
11392   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
11393   EVT VT = N0.getValueType();
11394
11395   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
11396   // since the result of setcc_c is all zeros or all ones.
11397   if (N1C && N0.getOpcode() == ISD::AND &&
11398       N0.getOperand(1).getOpcode() == ISD::Constant) {
11399     SDValue N00 = N0.getOperand(0);
11400     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
11401         ((N00.getOpcode() == ISD::ANY_EXTEND ||
11402           N00.getOpcode() == ISD::ZERO_EXTEND) &&
11403          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
11404       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
11405       APInt ShAmt = N1C->getAPIntValue();
11406       Mask = Mask.shl(ShAmt);
11407       if (Mask != 0)
11408         return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
11409                            N00, DAG.getConstant(Mask, VT));
11410     }
11411   }
11412
11413   return SDValue();
11414 }
11415
11416 /// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
11417 /// when possible.
11418 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
11419                                    const X86Subtarget *Subtarget) {
11420   EVT VT = N->getValueType(0);
11421   if (!VT.isVector() && VT.isInteger() &&
11422       N->getOpcode() == ISD::SHL)
11423     return PerformSHLCombine(N, DAG);
11424
11425   // On X86 with SSE2 support, we can transform this to a vector shift if
11426   // all elements are shifted by the same amount.  We can't do this in
11427   // legalize because a constant vector is typically transformed to a
11428   // constant pool load, so we have no knowledge of the shift amount.
11429   if (!Subtarget->hasSSE2())
11430     return SDValue();
11431
11432   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
11433     return SDValue();
11434
11435   SDValue ShAmtOp = N->getOperand(1);
11436   EVT EltVT = VT.getVectorElementType();
11437   DebugLoc DL = N->getDebugLoc();
11438   SDValue BaseShAmt = SDValue();
11439   if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
11440     unsigned NumElts = VT.getVectorNumElements();
11441     unsigned i = 0;
11442     for (; i != NumElts; ++i) {
11443       SDValue Arg = ShAmtOp.getOperand(i);
11444       if (Arg.getOpcode() == ISD::UNDEF) continue;
11445       BaseShAmt = Arg;
11446       break;
11447     }
11448     for (; i != NumElts; ++i) {
11449       SDValue Arg = ShAmtOp.getOperand(i);
11450       if (Arg.getOpcode() == ISD::UNDEF) continue;
11451       if (Arg != BaseShAmt) {
11452         return SDValue();
11453       }
11454     }
11455   } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
11456              cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
11457     SDValue InVec = ShAmtOp.getOperand(0);
11458     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
11459       unsigned NumElts = InVec.getValueType().getVectorNumElements();
11460       unsigned i = 0;
11461       for (; i != NumElts; ++i) {
11462         SDValue Arg = InVec.getOperand(i);
11463         if (Arg.getOpcode() == ISD::UNDEF) continue;
11464         BaseShAmt = Arg;
11465         break;
11466       }
11467     } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
11468       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
11469         unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
11470         if (C->getZExtValue() == SplatIdx)
11471           BaseShAmt = InVec.getOperand(1);
11472       }
11473     }
11474     if (BaseShAmt.getNode() == 0)
11475       BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
11476                               DAG.getIntPtrConstant(0));
11477   } else
11478     return SDValue();
11479
11480   // The shift amount is an i32.
11481   if (EltVT.bitsGT(MVT::i32))
11482     BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
11483   else if (EltVT.bitsLT(MVT::i32))
11484     BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
11485
11486   // The shift amount is identical, so we can do a vector shift.
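
// Illustrative sketch (not part of the lowering code; uses the SSE2
// intrinsic header, and the helper name is invented): what the rewrite
// below produces for a uniform shift of <4 x i32> by 5 -- one
// immediate-count pslld instead of a general per-element shift.
#if 0
#include <emmintrin.h>
static __m128i shiftLeftSplat5(__m128i V) {
  return _mm_slli_epi32(V, 5); // corresponds to x86_sse2_pslli_d below
}
#endif
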
11487 SDValue ValOp = N->getOperand(0); 11488 switch (N->getOpcode()) { 11489 default: 11490 llvm_unreachable("Unknown shift opcode!"); 11491 break; 11492 case ISD::SHL: 11493 if (VT == MVT::v2i64) 11494 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11495 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 11496 ValOp, BaseShAmt); 11497 if (VT == MVT::v4i32) 11498 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11499 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 11500 ValOp, BaseShAmt); 11501 if (VT == MVT::v8i16) 11502 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11503 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 11504 ValOp, BaseShAmt); 11505 break; 11506 case ISD::SRA: 11507 if (VT == MVT::v4i32) 11508 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11509 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 11510 ValOp, BaseShAmt); 11511 if (VT == MVT::v8i16) 11512 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11513 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 11514 ValOp, BaseShAmt); 11515 break; 11516 case ISD::SRL: 11517 if (VT == MVT::v2i64) 11518 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11519 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 11520 ValOp, BaseShAmt); 11521 if (VT == MVT::v4i32) 11522 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11523 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 11524 ValOp, BaseShAmt); 11525 if (VT == MVT::v8i16) 11526 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11527 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 11528 ValOp, BaseShAmt); 11529 break; 11530 } 11531 return SDValue(); 11532} 11533 11534 11535static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, 11536 TargetLowering::DAGCombinerInfo &DCI, 11537 const X86Subtarget *Subtarget) { 11538 if (DCI.isBeforeLegalizeOps()) 11539 return SDValue(); 11540 11541 // Want to form PANDN nodes, in the hopes of then easily combining them with 11542 // OR and AND nodes to form PBLEND/PSIGN. 
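
// Illustrative sketch (not part of the lowering code; helper name is
// invented): the bitwise-select idiom that PANDN enables and that
// PerformOrCombine below matches -- per-bit selection of X or Y under mask
// M, which is what PBLENDVB computes when M is all sign bits.
#if 0
static unsigned bitSelect(unsigned M, unsigned X, unsigned Y) {
  return (M & X) | (~M & Y); // AND + PANDN + OR on vectors
}
#endif
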
11543   EVT VT = N->getValueType(0);
11544   if (VT != MVT::v2i64)
11545     return SDValue();
11546
11547   SDValue N0 = N->getOperand(0);
11548   SDValue N1 = N->getOperand(1);
11549   DebugLoc DL = N->getDebugLoc();
11550
11551   // Check LHS for vnot
11552   if (N0.getOpcode() == ISD::XOR &&
11553       ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
11554     return DAG.getNode(X86ISD::PANDN, DL, VT, N0.getOperand(0), N1);
11555
11556   // Check RHS for vnot
11557   if (N1.getOpcode() == ISD::XOR &&
11558       ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
11559     return DAG.getNode(X86ISD::PANDN, DL, VT, N1.getOperand(0), N0);
11560
11561   return SDValue();
11562 }
11563
11564 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
11565                                 TargetLowering::DAGCombinerInfo &DCI,
11566                                 const X86Subtarget *Subtarget) {
11567   if (DCI.isBeforeLegalizeOps())
11568     return SDValue();
11569
11570   EVT VT = N->getValueType(0);
11571   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64)
11572     return SDValue();
11573
11574   SDValue N0 = N->getOperand(0);
11575   SDValue N1 = N->getOperand(1);
11576
11577   // look for psign/blend
11578   if (Subtarget->hasSSSE3()) {
11579     if (VT == MVT::v2i64) {
11580       // Canonicalize pandn to RHS
11581       if (N0.getOpcode() == X86ISD::PANDN)
11582         std::swap(N0, N1);
11583       // or (and (m, x), (pandn m, y))
11584       if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::PANDN) {
11585         SDValue Mask = N1.getOperand(0);
11586         SDValue X = N1.getOperand(1);
11587         SDValue Y;
11588         if (N0.getOperand(0) == Mask)
11589           Y = N0.getOperand(1);
11590         if (N0.getOperand(1) == Mask)
11591           Y = N0.getOperand(0);
11592
11593         // Check to see if the mask appeared in both the AND and the PANDN.
11594         if (!Y.getNode())
11595           return SDValue();
11596
11597         // Validate that X, Y, and Mask are bitcasts, and see through them.
11598         if (Mask.getOpcode() != ISD::BITCAST ||
11599             X.getOpcode() != ISD::BITCAST ||
11600             Y.getOpcode() != ISD::BITCAST)
11601           return SDValue();
11602
11603         // Look through mask bitcast.
11604         Mask = Mask.getOperand(0);
11605         EVT MaskVT = Mask.getValueType();
11606
11607         // Validate that the Mask operand is a vector sra node.  The sra node
11608         // will be an intrinsic.
11609         if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
11610           return SDValue();
11611
11612         // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
11613         // there is no psrai.b
11614         switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) {
11615         case Intrinsic::x86_sse2_psrai_w:
11616         case Intrinsic::x86_sse2_psrai_d:
11617           break;
11618         default: return SDValue();
11619         }
11620
11621         // Check that the SRA is all signbits.
11622         SDValue SraC = Mask.getOperand(2);
11623         unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
11624         unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
11625         if ((SraAmt + 1) != EltBits)
11626           return SDValue();
11627
11628         DebugLoc DL = N->getDebugLoc();
11629
11630         // Now we know we at least have a pblendvb with the mask val.  See if
11631         // we can form a psignb/w/d.
11632 // psign = x.type == y.type == mask.type && y = sub(0, x); 11633 X = X.getOperand(0); 11634 Y = Y.getOperand(0); 11635 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && 11636 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && 11637 X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){ 11638 unsigned Opc = 0; 11639 switch (EltBits) { 11640 case 8: Opc = X86ISD::PSIGNB; break; 11641 case 16: Opc = X86ISD::PSIGNW; break; 11642 case 32: Opc = X86ISD::PSIGND; break; 11643 default: break; 11644 } 11645 if (Opc) { 11646 SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1)); 11647 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign); 11648 } 11649 } 11650 // PBLENDVB only available on SSE 4.1 11651 if (!Subtarget->hasSSE41()) 11652 return SDValue(); 11653 11654 X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X); 11655 Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y); 11656 Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask); 11657 Mask = DAG.getNode(X86ISD::PBLENDVB, DL, MVT::v16i8, X, Y, Mask); 11658 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask); 11659 } 11660 } 11661 } 11662 11663 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 11664 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 11665 std::swap(N0, N1); 11666 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 11667 return SDValue(); 11668 if (!N0.hasOneUse() || !N1.hasOneUse()) 11669 return SDValue(); 11670 11671 SDValue ShAmt0 = N0.getOperand(1); 11672 if (ShAmt0.getValueType() != MVT::i8) 11673 return SDValue(); 11674 SDValue ShAmt1 = N1.getOperand(1); 11675 if (ShAmt1.getValueType() != MVT::i8) 11676 return SDValue(); 11677 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 11678 ShAmt0 = ShAmt0.getOperand(0); 11679 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 11680 ShAmt1 = ShAmt1.getOperand(0); 11681 11682 DebugLoc DL = N->getDebugLoc(); 11683 unsigned Opc = X86ISD::SHLD; 11684 SDValue Op0 = N0.getOperand(0); 11685 SDValue Op1 = N1.getOperand(0); 11686 if (ShAmt0.getOpcode() == ISD::SUB) { 11687 Opc = X86ISD::SHRD; 11688 std::swap(Op0, Op1); 11689 std::swap(ShAmt0, ShAmt1); 11690 } 11691 11692 unsigned Bits = VT.getSizeInBits(); 11693 if (ShAmt1.getOpcode() == ISD::SUB) { 11694 SDValue Sum = ShAmt1.getOperand(0); 11695 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 11696 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 11697 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 11698 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 11699 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 11700 return DAG.getNode(Opc, DL, VT, 11701 Op0, Op1, 11702 DAG.getNode(ISD::TRUNCATE, DL, 11703 MVT::i8, ShAmt0)); 11704 } 11705 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 11706 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 11707 if (ShAmt0C && 11708 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 11709 return DAG.getNode(Opc, DL, VT, 11710 N0.getOperand(0), N1.getOperand(0), 11711 DAG.getNode(ISD::TRUNCATE, DL, 11712 MVT::i8, ShAmt0)); 11713 } 11714 11715 return SDValue(); 11716} 11717 11718/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 11719static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 11720 const X86Subtarget *Subtarget) { 11721 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 11722 // the FP state in cases where an emms may be missing. 
11723 // A preferable solution to the general problem is to figure out the right 11724 // places to insert EMMS. This qualifies as a quick hack. 11725 11726 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 11727 StoreSDNode *St = cast<StoreSDNode>(N); 11728 EVT VT = St->getValue().getValueType(); 11729 if (VT.getSizeInBits() != 64) 11730 return SDValue(); 11731 11732 const Function *F = DAG.getMachineFunction().getFunction(); 11733 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 11734 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 11735 && Subtarget->hasSSE2(); 11736 if ((VT.isVector() || 11737 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 11738 isa<LoadSDNode>(St->getValue()) && 11739 !cast<LoadSDNode>(St->getValue())->isVolatile() && 11740 St->getChain().hasOneUse() && !St->isVolatile()) { 11741 SDNode* LdVal = St->getValue().getNode(); 11742 LoadSDNode *Ld = 0; 11743 int TokenFactorIndex = -1; 11744 SmallVector<SDValue, 8> Ops; 11745 SDNode* ChainVal = St->getChain().getNode(); 11746 // Must be a store of a load. We currently handle two cases: the load 11747 // is a direct child, and it's under an intervening TokenFactor. It is 11748 // possible to dig deeper under nested TokenFactors. 11749 if (ChainVal == LdVal) 11750 Ld = cast<LoadSDNode>(St->getChain()); 11751 else if (St->getValue().hasOneUse() && 11752 ChainVal->getOpcode() == ISD::TokenFactor) { 11753 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 11754 if (ChainVal->getOperand(i).getNode() == LdVal) { 11755 TokenFactorIndex = i; 11756 Ld = cast<LoadSDNode>(St->getValue()); 11757 } else 11758 Ops.push_back(ChainVal->getOperand(i)); 11759 } 11760 } 11761 11762 if (!Ld || !ISD::isNormalLoad(Ld)) 11763 return SDValue(); 11764 11765 // If this is not the MMX case, i.e. we are just turning i64 load/store 11766 // into f64 load/store, avoid the transformation if there are multiple 11767 // uses of the loaded value. 11768 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 11769 return SDValue(); 11770 11771 DebugLoc LdDL = Ld->getDebugLoc(); 11772 DebugLoc StDL = N->getDebugLoc(); 11773 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 11774 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 11775 // pair instead. 11776 if (Subtarget->is64Bit() || F64IsLegal) { 11777 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 11778 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 11779 Ld->getPointerInfo(), Ld->isVolatile(), 11780 Ld->isNonTemporal(), Ld->getAlignment()); 11781 SDValue NewChain = NewLd.getValue(1); 11782 if (TokenFactorIndex != -1) { 11783 Ops.push_back(NewChain); 11784 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 11785 Ops.size()); 11786 } 11787 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 11788 St->getPointerInfo(), 11789 St->isVolatile(), St->isNonTemporal(), 11790 St->getAlignment()); 11791 } 11792 11793 // Otherwise, lower to two pairs of 32-bit loads / stores. 
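
// Illustrative sketch (not part of the lowering code; helper name is
// invented): the shape of the 32-bit fallback emitted below -- one 64-bit
// copy becomes a lo/hi pair of 32-bit loads and stores at offsets 0 and 4.
#if 0
#include <cstdint>
#include <cstring>
static void copy64As2x32(const void *Src, void *Dst) {
  uint32_t Lo, Hi;
  std::memcpy(&Lo, static_cast<const char *>(Src) + 0, 4); // LoLd
  std::memcpy(&Hi, static_cast<const char *>(Src) + 4, 4); // HiLd
  std::memcpy(static_cast<char *>(Dst) + 0, &Lo, 4);       // LoSt
  std::memcpy(static_cast<char *>(Dst) + 4, &Hi, 4);       // HiSt
}
#endif
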
11794 SDValue LoAddr = Ld->getBasePtr(); 11795 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 11796 DAG.getConstant(4, MVT::i32)); 11797 11798 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 11799 Ld->getPointerInfo(), 11800 Ld->isVolatile(), Ld->isNonTemporal(), 11801 Ld->getAlignment()); 11802 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 11803 Ld->getPointerInfo().getWithOffset(4), 11804 Ld->isVolatile(), Ld->isNonTemporal(), 11805 MinAlign(Ld->getAlignment(), 4)); 11806 11807 SDValue NewChain = LoLd.getValue(1); 11808 if (TokenFactorIndex != -1) { 11809 Ops.push_back(LoLd); 11810 Ops.push_back(HiLd); 11811 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 11812 Ops.size()); 11813 } 11814 11815 LoAddr = St->getBasePtr(); 11816 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 11817 DAG.getConstant(4, MVT::i32)); 11818 11819 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 11820 St->getPointerInfo(), 11821 St->isVolatile(), St->isNonTemporal(), 11822 St->getAlignment()); 11823 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 11824 St->getPointerInfo().getWithOffset(4), 11825 St->isVolatile(), 11826 St->isNonTemporal(), 11827 MinAlign(St->getAlignment(), 4)); 11828 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 11829 } 11830 return SDValue(); 11831} 11832 11833/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 11834/// X86ISD::FXOR nodes. 11835static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 11836 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 11837 // F[X]OR(0.0, x) -> x 11838 // F[X]OR(x, 0.0) -> x 11839 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 11840 if (C->getValueAPF().isPosZero()) 11841 return N->getOperand(1); 11842 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 11843 if (C->getValueAPF().isPosZero()) 11844 return N->getOperand(0); 11845 return SDValue(); 11846} 11847 11848/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 11849static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 11850 // FAND(0.0, x) -> 0.0 11851 // FAND(x, 0.0) -> 0.0 11852 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 11853 if (C->getValueAPF().isPosZero()) 11854 return N->getOperand(0); 11855 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 11856 if (C->getValueAPF().isPosZero()) 11857 return N->getOperand(1); 11858 return SDValue(); 11859} 11860 11861static SDValue PerformBTCombine(SDNode *N, 11862 SelectionDAG &DAG, 11863 TargetLowering::DAGCombinerInfo &DCI) { 11864 // BT ignores high bits in the bit index operand. 
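
// Illustrative sketch (not part of the lowering code; helper name is
// invented): for register operands, BT masks the bit index to the operand
// width, i.e. bit = (Src >> (Idx & 31)) & 1 for 32-bit operands, which is
// why only the low log2(width) bits of the index are demanded below.
#if 0
static bool bt32(unsigned Src, unsigned Idx) {
  return (Src >> (Idx & 31)) & 1;
}
#endif
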
11865 SDValue Op1 = N->getOperand(1); 11866 if (Op1.hasOneUse()) { 11867 unsigned BitWidth = Op1.getValueSizeInBits(); 11868 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 11869 APInt KnownZero, KnownOne; 11870 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 11871 !DCI.isBeforeLegalizeOps()); 11872 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11873 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 11874 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 11875 DCI.CommitTargetLoweringOpt(TLO); 11876 } 11877 return SDValue(); 11878} 11879 11880static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 11881 SDValue Op = N->getOperand(0); 11882 if (Op.getOpcode() == ISD::BITCAST) 11883 Op = Op.getOperand(0); 11884 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 11885 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 11886 VT.getVectorElementType().getSizeInBits() == 11887 OpVT.getVectorElementType().getSizeInBits()) { 11888 return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); 11889 } 11890 return SDValue(); 11891} 11892 11893static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 11894 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 11895 // (and (i32 x86isd::setcc_carry), 1) 11896 // This eliminates the zext. This transformation is necessary because 11897 // ISD::SETCC is always legalized to i8. 11898 DebugLoc dl = N->getDebugLoc(); 11899 SDValue N0 = N->getOperand(0); 11900 EVT VT = N->getValueType(0); 11901 if (N0.getOpcode() == ISD::AND && 11902 N0.hasOneUse() && 11903 N0.getOperand(0).hasOneUse()) { 11904 SDValue N00 = N0.getOperand(0); 11905 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 11906 return SDValue(); 11907 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 11908 if (!C || C->getZExtValue() != 1) 11909 return SDValue(); 11910 return DAG.getNode(ISD::AND, dl, VT, 11911 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 11912 N00.getOperand(0), N00.getOperand(1)), 11913 DAG.getConstant(1, VT)); 11914 } 11915 11916 return SDValue(); 11917} 11918 11919// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 11920static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { 11921 unsigned X86CC = N->getConstantOperandVal(0); 11922 SDValue EFLAG = N->getOperand(1); 11923 DebugLoc DL = N->getDebugLoc(); 11924 11925 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 11926 // a zext and produces an all-ones bit which is more useful than 0/1 in some 11927 // cases. 11928 if (X86CC == X86::COND_B) 11929 return DAG.getNode(ISD::AND, DL, MVT::i8, 11930 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 11931 DAG.getConstant(X86CC, MVT::i8), EFLAG), 11932 DAG.getConstant(1, MVT::i8)); 11933 11934 return SDValue(); 11935} 11936 11937// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS 11938static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, 11939 X86TargetLowering::DAGCombinerInfo &DCI) { 11940 // If the LHS and RHS of the ADC node are zero, then it can't overflow and 11941 // the result is either zero or one (depending on the input carry bit). 11942 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 11943 if (X86::isZeroNode(N->getOperand(0)) && 11944 X86::isZeroNode(N->getOperand(1)) && 11945 // We don't have a good way to replace an EFLAGS use, so only do this when 11946 // dead right now. 
11947 SDValue(N, 1).use_empty()) { 11948 DebugLoc DL = N->getDebugLoc(); 11949 EVT VT = N->getValueType(0); 11950 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); 11951 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, 11952 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, 11953 DAG.getConstant(X86::COND_B,MVT::i8), 11954 N->getOperand(2)), 11955 DAG.getConstant(1, VT)); 11956 return DCI.CombineTo(N, Res1, CarryOut); 11957 } 11958 11959 return SDValue(); 11960} 11961 11962// fold (add Y, (sete X, 0)) -> adc 0, Y 11963// (add Y, (setne X, 0)) -> sbb -1, Y 11964// (sub (sete X, 0), Y) -> sbb 0, Y 11965// (sub (setne X, 0), Y) -> adc -1, Y 11966static SDValue OptimizeConditonalInDecrement(SDNode *N, SelectionDAG &DAG) { 11967 DebugLoc DL = N->getDebugLoc(); 11968 11969 // Look through ZExts. 11970 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); 11971 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) 11972 return SDValue(); 11973 11974 SDValue SetCC = Ext.getOperand(0); 11975 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) 11976 return SDValue(); 11977 11978 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); 11979 if (CC != X86::COND_E && CC != X86::COND_NE) 11980 return SDValue(); 11981 11982 SDValue Cmp = SetCC.getOperand(1); 11983 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || 11984 !X86::isZeroNode(Cmp.getOperand(1)) || 11985 !Cmp.getOperand(0).getValueType().isInteger()) 11986 return SDValue(); 11987 11988 SDValue CmpOp0 = Cmp.getOperand(0); 11989 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, 11990 DAG.getConstant(1, CmpOp0.getValueType())); 11991 11992 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); 11993 if (CC == X86::COND_NE) 11994 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, 11995 DL, OtherVal.getValueType(), OtherVal, 11996 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); 11997 return DAG.getNode(N->getOpcode() == ISD::SUB ? 
X86ISD::SBB : X86ISD::ADC,
11998                      DL, OtherVal.getValueType(), OtherVal,
11999                      DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
12000 }
12001
12002 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
12003                                              DAGCombinerInfo &DCI) const {
12004   SelectionDAG &DAG = DCI.DAG;
12005   switch (N->getOpcode()) {
12006   default: break;
12007   case ISD::EXTRACT_VECTOR_ELT:
12008     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
12009   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
12010   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
12011   case ISD::ADD:
12012   case ISD::SUB:            return OptimizeConditonalInDecrement(N, DAG);
12013   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
12014   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
12015   case ISD::SHL:
12016   case ISD::SRA:
12017   case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
12018   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
12019   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
12020   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
12021   case X86ISD::FXOR:
12022   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
12023   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
12024   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
12025   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
12026   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG);
12027   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG);
12028   case X86ISD::SHUFPS:      // Handle all target-specific shuffles
12029   case X86ISD::SHUFPD:
12030   case X86ISD::PALIGN:
12031   case X86ISD::PUNPCKHBW:
12032   case X86ISD::PUNPCKHWD:
12033   case X86ISD::PUNPCKHDQ:
12034   case X86ISD::PUNPCKHQDQ:
12035   case X86ISD::UNPCKHPS:
12036   case X86ISD::UNPCKHPD:
12037   case X86ISD::PUNPCKLBW:
12038   case X86ISD::PUNPCKLWD:
12039   case X86ISD::PUNPCKLDQ:
12040   case X86ISD::PUNPCKLQDQ:
12041   case X86ISD::UNPCKLPS:
12042   case X86ISD::UNPCKLPD:
12043   case X86ISD::VUNPCKLPS:
12044   case X86ISD::VUNPCKLPD:
12045   case X86ISD::VUNPCKLPSY:
12046   case X86ISD::VUNPCKLPDY:
12047   case X86ISD::MOVHLPS:
12048   case X86ISD::MOVLHPS:
12049   case X86ISD::PSHUFD:
12050   case X86ISD::PSHUFHW:
12051   case X86ISD::PSHUFLW:
12052   case X86ISD::MOVSS:
12053   case X86ISD::MOVSD:
12054   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
12055   }
12056
12057   return SDValue();
12058 }
12059
12060 /// isTypeDesirableForOp - Return true if the target has native support for
12061 /// the specified value type and it is 'desirable' to use the type for the
12062 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
12063 /// instruction encodings are longer and some i16 instructions are slow.
12064 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
12065   if (!isTypeLegal(VT))
12066     return false;
12067   if (VT != MVT::i16)
12068     return true;
12069
12070   switch (Opc) {
12071   default:
12072     return true;
12073   case ISD::LOAD:
12074   case ISD::SIGN_EXTEND:
12075   case ISD::ZERO_EXTEND:
12076   case ISD::ANY_EXTEND:
12077   case ISD::SHL:
12078   case ISD::SRL:
12079   case ISD::SUB:
12080   case ISD::ADD:
12081   case ISD::MUL:
12082   case ISD::AND:
12083   case ISD::OR:
12084   case ISD::XOR:
12085     return false;
12086   }
12087 }
12088
12089 /// IsDesirableToPromoteOp - This method queries the target whether it is
12090 /// beneficial for dag combiner to promote the specified node. If true, it
12091 /// should return the desired promotion type by reference.
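
// Illustrative sketch (not part of the lowering code; helper name is
// invented): why promoting i16 ops to i32, as decided below, pays off --
// 16-bit arithmetic needs the 0x66 operand-size prefix (addw vs. addl),
// while the i32 form computes the same low 16 bits.
#if 0
static unsigned short add16Via32(unsigned short A, unsigned short B) {
  return (unsigned short)((unsigned)A + (unsigned)B); // identical low 16 bits
}
#endif
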
12092 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
12093   EVT VT = Op.getValueType();
12094   if (VT != MVT::i16)
12095     return false;
12096
12097   bool Promote = false;
12098   bool Commute = false;
12099   switch (Op.getOpcode()) {
12100   default: break;
12101   case ISD::LOAD: {
12102     LoadSDNode *LD = cast<LoadSDNode>(Op);
12103     // If the non-extending load has a single use and it's not live out, then
12104     // it might be folded.
12105     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
12106                                                      Op.hasOneUse()*/) {
12107       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
12108              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
12109         // The only case where we'd want to promote LOAD (rather than it being
12110         // promoted as an operand) is when its only use is liveout.
12111         if (UI->getOpcode() != ISD::CopyToReg)
12112           return false;
12113       }
12114     }
12115     Promote = true;
12116     break;
12117   }
12118   case ISD::SIGN_EXTEND:
12119   case ISD::ZERO_EXTEND:
12120   case ISD::ANY_EXTEND:
12121     Promote = true;
12122     break;
12123   case ISD::SHL:
12124   case ISD::SRL: {
12125     SDValue N0 = Op.getOperand(0);
12126     // Look out for (store (shl (load), x)).
12127     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
12128       return false;
12129     Promote = true;
12130     break;
12131   }
12132   case ISD::ADD:
12133   case ISD::MUL:
12134   case ISD::AND:
12135   case ISD::OR:
12136   case ISD::XOR:
12137     Commute = true;
12138     // fallthrough
12139   case ISD::SUB: {
12140     SDValue N0 = Op.getOperand(0);
12141     SDValue N1 = Op.getOperand(1);
12142     if (!Commute && MayFoldLoad(N1))
12143       return false;
12144     // Avoid disabling potential load folding opportunities.
12145     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
12146       return false;
12147     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
12148       return false;
12149     Promote = true;
12150   }
12151   }
12152
12153   PVT = MVT::i32;
12154   return Promote;
12155 }
12156
12157 //===----------------------------------------------------------------------===//
12158 //                         X86 Inline Assembly Support
12159 //===----------------------------------------------------------------------===//
12160
12161 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
12162   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
12163
12164   std::string AsmStr = IA->getAsmString();
12165
12166   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
12167   SmallVector<StringRef, 4> AsmPieces;
12168   SplitString(AsmStr, AsmPieces, ";\n");
12169
12170   switch (AsmPieces.size()) {
12171   default: return false;
12172   case 1:
12173     AsmStr = AsmPieces[0];
12174     AsmPieces.clear();
12175     SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace.
12176
12177     // FIXME: this should verify that we are targeting a 486 or better.  If
12178     // not, we will turn this bswap into something that will be lowered to
12179     // logical ops instead of emitting the bswap asm.  For now, we don't
12180     // support 486 or lower, so don't worry about this.
12181     // bswap $0
12182     if (AsmPieces.size() == 2 &&
12183         (AsmPieces[0] == "bswap" ||
12184          AsmPieces[0] == "bswapq" ||
12185          AsmPieces[0] == "bswapl") &&
12186         (AsmPieces[1] == "$0" ||
12187          AsmPieces[1] == "${0:q}")) {
12188       // No need to check constraints, nothing other than the equivalent of
12189       // "=r,0" would be valid here.
      const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
      if (!Ty || Ty->getBitWidth() % 16 != 0)
        return false;
      return IntrinsicLowering::LowerToByteSwap(CI);
    }
    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        AsmPieces.size() == 3 &&
        (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") &&
        AsmPieces[1] == "$$8," &&
        AsmPieces[2] == "${0:w}" &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
      AsmPieces.clear();
      const std::string &ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      std::sort(AsmPieces.begin(), AsmPieces.end());
      if (AsmPieces.size() == 4 &&
          AsmPieces[0] == "~{cc}" &&
          AsmPieces[1] == "~{dirflag}" &&
          AsmPieces[2] == "~{flags}" &&
          AsmPieces[3] == "~{fpsr}") {
        const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
        if (!Ty || Ty->getBitWidth() % 16 != 0)
          return false;
        return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
      SmallVector<StringRef, 4> Words;
      SplitString(AsmPieces[0], Words, " \t,");
      if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" &&
          Words[2] == "${0:w}") {
        Words.clear();
        SplitString(AsmPieces[1], Words, " \t,");
        if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" &&
            Words[2] == "$0") {
          Words.clear();
          SplitString(AsmPieces[2], Words, " \t,");
          if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" &&
              Words[2] == "${0:w}") {
            AsmPieces.clear();
            const std::string &ConstraintsStr = IA->getConstraintString();
            SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
            std::sort(AsmPieces.begin(), AsmPieces.end());
            if (AsmPieces.size() == 4 &&
                AsmPieces[0] == "~{cc}" &&
                AsmPieces[1] == "~{dirflag}" &&
                AsmPieces[2] == "~{flags}" &&
                AsmPieces[3] == "~{fpsr}") {
              const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
              if (!Ty || Ty->getBitWidth() % 16 != 0)
                return false;
              return IntrinsicLowering::LowerToByteSwap(CI);
            }
          }
        }
      }
    }

    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx  ->  llvm.bswap.i64
        SmallVector<StringRef, 4> Words;
        SplitString(AsmPieces[0], Words, " \t");
        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
          Words.clear();
          SplitString(AsmPieces[1], Words, " \t");
          if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
            Words.clear();
            SplitString(AsmPieces[2], Words, " \t,");
            if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
                Words[2] == "%edx") {
              const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
              if (!Ty || Ty->getBitWidth() % 16 != 0)
                return false;
              return IntrinsicLowering::LowerToByteSwap(CI);
            }
          }
        }
      }
    }
    break;
  }
  return false;
}
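
// As a hypothetical example of source the i64 matcher above handles, it
// would rewrite
//   asm("bswap %eax\n\tbswap %edx\n\txchgl %eax, %edx" : "=A"(v) : "0"(v));
// into a single call to llvm.bswap.i64: the "A" constraint pins the value to
// the EDX:EAX pair, so byte-swapping each half and then exchanging the two
// halves is exactly a 64-bit byte swap.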
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R':
    case 'q':
    case 'Q':
    case 'f':
    case 't':
    case 'u':
    case 'y':
    case 'x':
    case 'Y':
      return C_RegisterClass;
    case 'a':
    case 'b':
    case 'c':
    case 'd':
    case 'S':
    case 'D':
    case 'A':
      return C_Register;
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'G':
    case 'C':
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (CallOperandVal == NULL)
    return CW_Default;
  const Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
  case 'R':
  case 'q':
  case 'Q':
  case 'a':
  case 'b':
  case 'c':
  case 'd':
  case 'S':
  case 'D':
  case 'A':
    if (type->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f':
  case 't':
  case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget->hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'x':
  case 'Y':
    if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM())
      weight = CW_Register;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
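  // 'G' and 'C' are GCC's floating-point-constant constraints (x87 and SSE
  // respectively); only weight them for FP constant operands.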
  case 'G':
  case 'C':
    if (isa<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}

/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasXMMInt())
      return "Y";
    if (Subtarget->hasXMM())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     char Constraint,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result(0, 0);

  switch (Constraint) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
        break;
      }
      // FIXME gcc accepts some relocatable values here too, but only in
      // certain memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in
    // certain memory models; it's complicated.
    return;
  }
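  // 'i': a general integer immediate.  Besides literal constants this also
  // accepts a non-PIC global address plus a constant displacement; e.g. a DAG
  // operand shaped like (add (GlobalAddress @g), 16) is folded to @g+16 by
  // the matching loop below (@g is an illustrative name, not from this file).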
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup.  These can't
    // be used as immediates.
    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = 0;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
                                                        getTargetMachine())))
      return;

    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

std::vector<unsigned> X86TargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                  EVT VT) const {
  if (Constraint.size() == 1) {
    // FIXME: not handling fp-stack yet!
    switch (Constraint[0]) {      // GCC X86 Constraint Letters
    default: break;  // Unknown constraint letter
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
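      // In GCC terms 'q' is any register addressable as a byte register:
      // every GPR in 64-bit mode, but only a/b/c/d in 32-bit mode, where we
      // fall through to the 'Q' set below.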
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32)
          return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
                                       X86::ESI, X86::EDI, X86::R8D, X86::R9D,
                                       X86::R10D, X86::R11D, X86::R12D,
                                       X86::R13D, X86::R14D, X86::R15D,
                                       X86::EBP, X86::ESP, 0);
        else if (VT == MVT::i16)
          return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX,
                                       X86::SI, X86::DI, X86::R8W, X86::R9W,
                                       X86::R10W, X86::R11W, X86::R12W,
                                       X86::R13W, X86::R14W, X86::R15W,
                                       X86::BP, X86::SP, 0);
        else if (VT == MVT::i8)
          return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL,
                                       X86::SIL, X86::DIL, X86::R8B, X86::R9B,
                                       X86::R10B, X86::R11B, X86::R12B,
                                       X86::R13B, X86::R14B, X86::R15B,
                                       X86::BPL, X86::SPL, 0);

        else if (VT == MVT::i64)
          return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
                                       X86::RSI, X86::RDI, X86::R8, X86::R9,
                                       X86::R10, X86::R11, X86::R12,
                                       X86::R13, X86::R14, X86::R15,
                                       X86::RBP, X86::RSP, 0);

        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32)
        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
      else if (VT == MVT::i16)
        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
      else if (VT == MVT::i8)
        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
      else if (VT == MVT::i64)
        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
      break;
    }
  }

  return std::vector<unsigned>();
}

std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasXMMInt()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasXMM()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map st(0) .. st(7) to the corresponding ST register, all in RFP80.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::ST0 + Constraint[4] - '0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it
  // to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
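  // For example, the constraint "{ax}" paired with an i32 operand should
  // resolve to EAX in GR32 below, not stay as AX in GR16.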
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it
    // can find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}