X86ISelLowering.cpp revision 0e6d230abdbf6ba67a2676c118431a4df8fb15dd
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "Utils/X86ShuffleDecode.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static SDValue Insert128BitVector(SDValue Result,
                                  SDValue Vec,
                                  SDValue Idx,
                                  SelectionDAG &DAG,
                                  DebugLoc dl);

static SDValue Extract128BitVector(SDValue Vec,
                                   SDValue Idx,
                                   SelectionDAG &DAG,
                                   DebugLoc dl);

/// Generate a DAG to grab 128 bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 instruction or a
/// simple subregister reference. Idx is an index in the 128 bits we
/// want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec,
                                   SDValue Idx,
                                   SelectionDAG &DAG,
                                   DebugLoc dl) {
  EVT VT = Vec.getValueType();
  assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
  EVT ElVT = VT.getVectorElementType();
  int Factor = VT.getSizeInBits()/128;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract from UNDEF is UNDEF.
  if (Vec.getOpcode() == ISD::UNDEF)
    return DAG.getNode(ISD::UNDEF, dl, ResultVT);

  if (isa<ConstantSDNode>(Idx)) {
    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

    // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR
    // we can match to VEXTRACTF128.
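    // For example, with a v8i32 source ElVT is i32, so ElemsPerChunk below
    // evaluates to 128/32 = 4, and an element index IdxVal of 5 normalizes
    // to 4, i.e. the first element of the upper 128-bit half.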
    unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();

    // This is the index of the first element of the 128-bit chunk
    // we want.
    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
                                 * ElemsPerChunk);

    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
    SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                                 VecIdx);

    return Result;
  }

  return SDValue();
}

/// Generate a DAG to put 128 bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128 instruction or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result,
                                  SDValue Vec,
                                  SDValue Idx,
                                  SelectionDAG &DAG,
                                  DebugLoc dl) {
  if (isa<ConstantSDNode>(Idx)) {
    EVT VT = Vec.getValueType();
    assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");

    EVT ElVT = VT.getVectorElementType();
    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
    EVT ResultVT = Result.getValueType();

    // Insert the relevant 128 bits.
    unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();

    // This is the index of the first element of the 128-bit chunk
    // we want.
    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
                                 * ElemsPerChunk);

    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
    Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                         VecIdx);
    return Result;
  }

  return SDValue();
}

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
  bool is64Bit = Subtarget->is64Bit();

  if (Subtarget->isTargetEnvMacho()) {
    if (is64Bit)
      return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  }

  if (Subtarget->isTargetELF())
    return new TargetLoweringObjectFileELF();
  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    return new TargetLoweringObjectFileCOFF();
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasXMMInt() || Subtarget->hasAVX();
  X86ScalarSSEf32 = Subtarget->hasXMM() || Subtarget->hasAVX();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.
  static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird; it always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler; for
  // 32-bit code use the register pressure specific scheduling.
  if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    // Set up Windows compiler runtime calls.
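    // The names set below are the Microsoft CRT helpers for 64-bit integer
    // division/remainder/multiplication (_alldiv, _aullrem, _allmul, ...)
    // and for FP-to-unsigned-i64 conversion (_ftol2).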
194 setLibcallName(RTLIB::SDIV_I64, "_alldiv"); 195 setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); 196 setLibcallName(RTLIB::SREM_I64, "_allrem"); 197 setLibcallName(RTLIB::UREM_I64, "_aullrem"); 198 setLibcallName(RTLIB::MUL_I64, "_allmul"); 199 setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2"); 200 setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2"); 201 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); 202 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); 203 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); 204 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); 205 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); 206 setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C); 207 setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C); 208 } 209 210 if (Subtarget->isTargetDarwin()) { 211 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. 212 setUseUnderscoreSetJmp(false); 213 setUseUnderscoreLongJmp(false); 214 } else if (Subtarget->isTargetMingw()) { 215 // MS runtime is weird: it exports _setjmp, but longjmp! 216 setUseUnderscoreSetJmp(true); 217 setUseUnderscoreLongJmp(false); 218 } else { 219 setUseUnderscoreSetJmp(true); 220 setUseUnderscoreLongJmp(true); 221 } 222 223 // Set up the register classes. 224 addRegisterClass(MVT::i8, X86::GR8RegisterClass); 225 addRegisterClass(MVT::i16, X86::GR16RegisterClass); 226 addRegisterClass(MVT::i32, X86::GR32RegisterClass); 227 if (Subtarget->is64Bit()) 228 addRegisterClass(MVT::i64, X86::GR64RegisterClass); 229 230 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 231 232 // We don't accept any truncstore of integer registers. 233 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 234 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 235 setTruncStoreAction(MVT::i64, MVT::i8 , Expand); 236 setTruncStoreAction(MVT::i32, MVT::i16, Expand); 237 setTruncStoreAction(MVT::i32, MVT::i8 , Expand); 238 setTruncStoreAction(MVT::i16, MVT::i8, Expand); 239 240 // SETOEQ and SETUNE require checking two conditions. 241 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); 242 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); 243 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); 244 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); 245 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); 246 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); 247 248 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this 249 // operation. 250 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); 251 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); 252 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); 253 254 if (Subtarget->is64Bit()) { 255 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); 256 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); 257 } else if (!UseSoftFloat) { 258 // We have an algorithm for SSE2->double, and we turn this into a 259 // 64-bit FILD followed by conditional FADD for other targets. 260 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 261 // We have an algorithm for SSE2, and we turn this into a 64-bit 262 // FILD for other targets. 263 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); 264 } 265 266 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have 267 // this operation. 
268 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); 269 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); 270 271 if (!UseSoftFloat) { 272 // SSE has no i16 to fp conversion, only i32 273 if (X86ScalarSSEf32) { 274 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 275 // f32 and f64 cases are Legal, f80 case is not 276 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 277 } else { 278 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); 279 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 280 } 281 } else { 282 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 283 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); 284 } 285 286 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 287 // are Legal, f80 is custom lowered. 288 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); 289 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); 290 291 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have 292 // this operation. 293 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); 294 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); 295 296 if (X86ScalarSSEf32) { 297 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); 298 // f32 and f64 cases are Legal, f80 case is not 299 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 300 } else { 301 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); 302 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 303 } 304 305 // Handle FP_TO_UINT by promoting the destination to a larger signed 306 // conversion. 307 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); 308 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); 309 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); 310 311 if (Subtarget->is64Bit()) { 312 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); 313 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); 314 } else if (!UseSoftFloat) { 315 if (X86ScalarSSEf32 && !Subtarget->hasSSE3()) 316 // Expand FP_TO_UINT into a select. 317 // FIXME: We would like to use a Custom expander here eventually to do 318 // the optimal thing for SSE vs. the default expansion in the legalizer. 319 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); 320 else 321 // With SSE3 we can use fisttpll to convert to a signed i64; without 322 // SSE, we're stuck with a fistpll. 323 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); 324 } 325 326 // TODO: when we have SSE, these could be more efficient, by using movd/movq. 327 if (!X86ScalarSSEf64) { 328 setOperationAction(ISD::BITCAST , MVT::f32 , Expand); 329 setOperationAction(ISD::BITCAST , MVT::i32 , Expand); 330 if (Subtarget->is64Bit()) { 331 setOperationAction(ISD::BITCAST , MVT::f64 , Expand); 332 // Without SSE, i64->f64 goes through memory. 333 setOperationAction(ISD::BITCAST , MVT::i64 , Expand); 334 } 335 } 336 337 // Scalar integer divide and remainder are lowered to use operations that 338 // produce two results, to match the available instructions. This exposes 339 // the two-result form to trivial CSE, which is able to combine x/y and x%y 340 // into a single instruction. 341 // 342 // Scalar integer multiply-high is also lowered to use two-result 343 // operations, to match the available instructions. However, plain multiply 344 // (low) operations are left as Legal, as there are single-result 345 // instructions for this in x86. 
Using the two-result multiply instructions 346 // when both high and low results are needed must be arranged by dagcombine. 347 for (unsigned i = 0, e = 4; i != e; ++i) { 348 MVT VT = IntVTs[i]; 349 setOperationAction(ISD::MULHS, VT, Expand); 350 setOperationAction(ISD::MULHU, VT, Expand); 351 setOperationAction(ISD::SDIV, VT, Expand); 352 setOperationAction(ISD::UDIV, VT, Expand); 353 setOperationAction(ISD::SREM, VT, Expand); 354 setOperationAction(ISD::UREM, VT, Expand); 355 356 // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. 357 setOperationAction(ISD::ADDC, VT, Custom); 358 setOperationAction(ISD::ADDE, VT, Custom); 359 setOperationAction(ISD::SUBC, VT, Custom); 360 setOperationAction(ISD::SUBE, VT, Custom); 361 } 362 363 setOperationAction(ISD::BR_JT , MVT::Other, Expand); 364 setOperationAction(ISD::BRCOND , MVT::Other, Custom); 365 setOperationAction(ISD::BR_CC , MVT::Other, Expand); 366 setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); 367 if (Subtarget->is64Bit()) 368 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 369 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); 370 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 371 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); 372 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); 373 setOperationAction(ISD::FREM , MVT::f32 , Expand); 374 setOperationAction(ISD::FREM , MVT::f64 , Expand); 375 setOperationAction(ISD::FREM , MVT::f80 , Expand); 376 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); 377 378 setOperationAction(ISD::CTTZ , MVT::i8 , Custom); 379 setOperationAction(ISD::CTLZ , MVT::i8 , Custom); 380 setOperationAction(ISD::CTTZ , MVT::i16 , Custom); 381 setOperationAction(ISD::CTLZ , MVT::i16 , Custom); 382 setOperationAction(ISD::CTTZ , MVT::i32 , Custom); 383 setOperationAction(ISD::CTLZ , MVT::i32 , Custom); 384 if (Subtarget->is64Bit()) { 385 setOperationAction(ISD::CTTZ , MVT::i64 , Custom); 386 setOperationAction(ISD::CTLZ , MVT::i64 , Custom); 387 } 388 389 if (Subtarget->hasPOPCNT()) { 390 setOperationAction(ISD::CTPOP , MVT::i8 , Promote); 391 } else { 392 setOperationAction(ISD::CTPOP , MVT::i8 , Expand); 393 setOperationAction(ISD::CTPOP , MVT::i16 , Expand); 394 setOperationAction(ISD::CTPOP , MVT::i32 , Expand); 395 if (Subtarget->is64Bit()) 396 setOperationAction(ISD::CTPOP , MVT::i64 , Expand); 397 } 398 399 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); 400 setOperationAction(ISD::BSWAP , MVT::i16 , Expand); 401 402 // These should be promoted to a larger select which is supported. 403 setOperationAction(ISD::SELECT , MVT::i1 , Promote); 404 // X86 wants to expand cmov itself. 
405 setOperationAction(ISD::SELECT , MVT::i8 , Custom); 406 setOperationAction(ISD::SELECT , MVT::i16 , Custom); 407 setOperationAction(ISD::SELECT , MVT::i32 , Custom); 408 setOperationAction(ISD::SELECT , MVT::f32 , Custom); 409 setOperationAction(ISD::SELECT , MVT::f64 , Custom); 410 setOperationAction(ISD::SELECT , MVT::f80 , Custom); 411 setOperationAction(ISD::SETCC , MVT::i8 , Custom); 412 setOperationAction(ISD::SETCC , MVT::i16 , Custom); 413 setOperationAction(ISD::SETCC , MVT::i32 , Custom); 414 setOperationAction(ISD::SETCC , MVT::f32 , Custom); 415 setOperationAction(ISD::SETCC , MVT::f64 , Custom); 416 setOperationAction(ISD::SETCC , MVT::f80 , Custom); 417 if (Subtarget->is64Bit()) { 418 setOperationAction(ISD::SELECT , MVT::i64 , Custom); 419 setOperationAction(ISD::SETCC , MVT::i64 , Custom); 420 } 421 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); 422 423 // Darwin ABI issue. 424 setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); 425 setOperationAction(ISD::JumpTable , MVT::i32 , Custom); 426 setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); 427 setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); 428 if (Subtarget->is64Bit()) 429 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 430 setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); 431 setOperationAction(ISD::BlockAddress , MVT::i32 , Custom); 432 if (Subtarget->is64Bit()) { 433 setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); 434 setOperationAction(ISD::JumpTable , MVT::i64 , Custom); 435 setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); 436 setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); 437 setOperationAction(ISD::BlockAddress , MVT::i64 , Custom); 438 } 439 // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) 440 setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); 441 setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); 442 setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); 443 if (Subtarget->is64Bit()) { 444 setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); 445 setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); 446 setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); 447 } 448 449 if (Subtarget->hasXMM()) 450 setOperationAction(ISD::PREFETCH , MVT::Other, Legal); 451 452 setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom); 453 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); 454 455 // On X86 and X86-64, atomic operations are lowered to locked instructions. 456 // Locked instructions, in turn, have implicit fence semantics (all memory 457 // operations are flushed before issuing the locked instruction, and they 458 // are not buffered), so we can fold away the common pattern of 459 // fence-atomic-fence. 
460 setShouldFoldAtomicFences(true); 461 462 // Expand certain atomics 463 for (unsigned i = 0, e = 4; i != e; ++i) { 464 MVT VT = IntVTs[i]; 465 setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); 466 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); 467 } 468 469 if (!Subtarget->is64Bit()) { 470 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); 471 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 472 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 473 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); 474 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); 475 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); 476 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); 477 } 478 479 // FIXME - use subtarget debug flags 480 if (!Subtarget->isTargetDarwin() && 481 !Subtarget->isTargetELF() && 482 !Subtarget->isTargetCygMing()) { 483 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); 484 } 485 486 setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); 487 setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); 488 setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); 489 setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); 490 if (Subtarget->is64Bit()) { 491 setExceptionPointerRegister(X86::RAX); 492 setExceptionSelectorRegister(X86::RDX); 493 } else { 494 setExceptionPointerRegister(X86::EAX); 495 setExceptionSelectorRegister(X86::EDX); 496 } 497 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); 498 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); 499 500 setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom); 501 502 setOperationAction(ISD::TRAP, MVT::Other, Legal); 503 504 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 505 setOperationAction(ISD::VASTART , MVT::Other, Custom); 506 setOperationAction(ISD::VAEND , MVT::Other, Expand); 507 if (Subtarget->is64Bit()) { 508 setOperationAction(ISD::VAARG , MVT::Other, Custom); 509 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 510 } else { 511 setOperationAction(ISD::VAARG , MVT::Other, Expand); 512 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 513 } 514 515 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 516 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 517 setOperationAction(ISD::DYNAMIC_STACKALLOC, 518 (Subtarget->is64Bit() ? MVT::i64 : MVT::i32), 519 (Subtarget->isTargetCOFF() 520 && !Subtarget->isTargetEnvMacho() 521 ? Custom : Expand)); 522 523 if (!UseSoftFloat && X86ScalarSSEf64) { 524 // f32 and f64 use SSE. 525 // Set up the FP register classes. 526 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 527 addRegisterClass(MVT::f64, X86::FR64RegisterClass); 528 529 // Use ANDPD to simulate FABS. 530 setOperationAction(ISD::FABS , MVT::f64, Custom); 531 setOperationAction(ISD::FABS , MVT::f32, Custom); 532 533 // Use XORP to simulate FNEG. 534 setOperationAction(ISD::FNEG , MVT::f64, Custom); 535 setOperationAction(ISD::FNEG , MVT::f32, Custom); 536 537 // Use ANDPD and ORPD to simulate FCOPYSIGN. 538 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 539 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 540 541 // Lower this to FGETSIGNx86 plus an AND. 
542 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); 543 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); 544 545 // We don't support sin/cos/fmod 546 setOperationAction(ISD::FSIN , MVT::f64, Expand); 547 setOperationAction(ISD::FCOS , MVT::f64, Expand); 548 setOperationAction(ISD::FSIN , MVT::f32, Expand); 549 setOperationAction(ISD::FCOS , MVT::f32, Expand); 550 551 // Expand FP immediates into loads from the stack, except for the special 552 // cases we handle. 553 addLegalFPImmediate(APFloat(+0.0)); // xorpd 554 addLegalFPImmediate(APFloat(+0.0f)); // xorps 555 } else if (!UseSoftFloat && X86ScalarSSEf32) { 556 // Use SSE for f32, x87 for f64. 557 // Set up the FP register classes. 558 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 559 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 560 561 // Use ANDPS to simulate FABS. 562 setOperationAction(ISD::FABS , MVT::f32, Custom); 563 564 // Use XORP to simulate FNEG. 565 setOperationAction(ISD::FNEG , MVT::f32, Custom); 566 567 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 568 569 // Use ANDPS and ORPS to simulate FCOPYSIGN. 570 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 571 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 572 573 // We don't support sin/cos/fmod 574 setOperationAction(ISD::FSIN , MVT::f32, Expand); 575 setOperationAction(ISD::FCOS , MVT::f32, Expand); 576 577 // Special cases we handle for FP constants. 578 addLegalFPImmediate(APFloat(+0.0f)); // xorps 579 addLegalFPImmediate(APFloat(+0.0)); // FLD0 580 addLegalFPImmediate(APFloat(+1.0)); // FLD1 581 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 582 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 583 584 if (!UnsafeFPMath) { 585 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 586 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 587 } 588 } else if (!UseSoftFloat) { 589 // f32 and f64 in x87. 590 // Set up the FP register classes. 591 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 592 addRegisterClass(MVT::f32, X86::RFP32RegisterClass); 593 594 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 595 setOperationAction(ISD::UNDEF, MVT::f32, Expand); 596 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 597 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 598 599 if (!UnsafeFPMath) { 600 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 601 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 602 } 603 addLegalFPImmediate(APFloat(+0.0)); // FLD0 604 addLegalFPImmediate(APFloat(+1.0)); // FLD1 605 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 606 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 607 addLegalFPImmediate(APFloat(+0.0f)); // FLD0 608 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 609 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS 610 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS 611 } 612 613 // We don't support FMA. 614 setOperationAction(ISD::FMA, MVT::f64, Expand); 615 setOperationAction(ISD::FMA, MVT::f32, Expand); 616 617 // Long double always uses X87. 
618 if (!UseSoftFloat) { 619 addRegisterClass(MVT::f80, X86::RFP80RegisterClass); 620 setOperationAction(ISD::UNDEF, MVT::f80, Expand); 621 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); 622 { 623 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); 624 addLegalFPImmediate(TmpFlt); // FLD0 625 TmpFlt.changeSign(); 626 addLegalFPImmediate(TmpFlt); // FLD0/FCHS 627 628 bool ignored; 629 APFloat TmpFlt2(+1.0); 630 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 631 &ignored); 632 addLegalFPImmediate(TmpFlt2); // FLD1 633 TmpFlt2.changeSign(); 634 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS 635 } 636 637 if (!UnsafeFPMath) { 638 setOperationAction(ISD::FSIN , MVT::f80 , Expand); 639 setOperationAction(ISD::FCOS , MVT::f80 , Expand); 640 } 641 642 setOperationAction(ISD::FMA, MVT::f80, Expand); 643 } 644 645 // Always use a library call for pow. 646 setOperationAction(ISD::FPOW , MVT::f32 , Expand); 647 setOperationAction(ISD::FPOW , MVT::f64 , Expand); 648 setOperationAction(ISD::FPOW , MVT::f80 , Expand); 649 650 setOperationAction(ISD::FLOG, MVT::f80, Expand); 651 setOperationAction(ISD::FLOG2, MVT::f80, Expand); 652 setOperationAction(ISD::FLOG10, MVT::f80, Expand); 653 setOperationAction(ISD::FEXP, MVT::f80, Expand); 654 setOperationAction(ISD::FEXP2, MVT::f80, Expand); 655 656 // First set operation action for all vector types to either promote 657 // (for widening) or expand (for scalarization). Then we will selectively 658 // turn on ones that can be effectively codegen'd. 659 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 660 VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { 661 setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); 662 setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); 663 setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); 664 setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); 665 setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); 666 setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); 667 setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); 668 setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); 669 setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); 670 setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); 671 setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); 672 setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); 673 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); 674 setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); 675 setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); 676 setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); 677 setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 678 setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 679 setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); 680 setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); 681 setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); 682 setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); 683 setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); 684 setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); 685 setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); 686 setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 687 
setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 688 setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); 689 setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); 690 setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); 691 setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); 692 setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); 693 setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); 694 setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); 695 setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); 696 setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); 697 setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); 698 setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); 699 setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); 700 setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand); 701 setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); 702 setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); 703 setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); 704 setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); 705 setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); 706 setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); 707 setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); 708 setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 709 setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 710 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand); 711 setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand); 712 setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); 713 setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); 714 setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); 715 for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 716 InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) 717 setTruncStoreAction((MVT::SimpleValueType)VT, 718 (MVT::SimpleValueType)InnerVT, Expand); 719 setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); 720 setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); 721 setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); 722 } 723 724 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 725 // with -msoft-float, disable use of MMX as well. 726 if (!UseSoftFloat && Subtarget->hasMMX()) { 727 addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); 728 // No operations on x86mmx supported, everything uses intrinsics. 729 } 730 731 // MMX-sized vectors (other than x86mmx) are expected to be expanded 732 // into smaller operations. 
733 setOperationAction(ISD::MULHS, MVT::v8i8, Expand); 734 setOperationAction(ISD::MULHS, MVT::v4i16, Expand); 735 setOperationAction(ISD::MULHS, MVT::v2i32, Expand); 736 setOperationAction(ISD::MULHS, MVT::v1i64, Expand); 737 setOperationAction(ISD::AND, MVT::v8i8, Expand); 738 setOperationAction(ISD::AND, MVT::v4i16, Expand); 739 setOperationAction(ISD::AND, MVT::v2i32, Expand); 740 setOperationAction(ISD::AND, MVT::v1i64, Expand); 741 setOperationAction(ISD::OR, MVT::v8i8, Expand); 742 setOperationAction(ISD::OR, MVT::v4i16, Expand); 743 setOperationAction(ISD::OR, MVT::v2i32, Expand); 744 setOperationAction(ISD::OR, MVT::v1i64, Expand); 745 setOperationAction(ISD::XOR, MVT::v8i8, Expand); 746 setOperationAction(ISD::XOR, MVT::v4i16, Expand); 747 setOperationAction(ISD::XOR, MVT::v2i32, Expand); 748 setOperationAction(ISD::XOR, MVT::v1i64, Expand); 749 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand); 750 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand); 751 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand); 752 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand); 753 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); 754 setOperationAction(ISD::SELECT, MVT::v8i8, Expand); 755 setOperationAction(ISD::SELECT, MVT::v4i16, Expand); 756 setOperationAction(ISD::SELECT, MVT::v2i32, Expand); 757 setOperationAction(ISD::SELECT, MVT::v1i64, Expand); 758 setOperationAction(ISD::BITCAST, MVT::v8i8, Expand); 759 setOperationAction(ISD::BITCAST, MVT::v4i16, Expand); 760 setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); 761 setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); 762 763 if (!UseSoftFloat && Subtarget->hasXMM()) { 764 addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); 765 766 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 767 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 768 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 769 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 770 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 771 setOperationAction(ISD::FNEG, MVT::v4f32, Custom); 772 setOperationAction(ISD::LOAD, MVT::v4f32, Legal); 773 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 774 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 775 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 776 setOperationAction(ISD::SELECT, MVT::v4f32, Custom); 777 setOperationAction(ISD::VSETCC, MVT::v4f32, Custom); 778 } 779 780 if (!UseSoftFloat && Subtarget->hasXMMInt()) { 781 addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); 782 783 // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM 784 // registers cannot be used even for integer operations. 
785 addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); 786 addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); 787 addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); 788 addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); 789 790 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 791 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 792 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 793 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 794 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 795 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 796 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 797 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 798 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 799 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 800 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 801 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 802 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 803 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 804 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 805 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 806 807 setOperationAction(ISD::VSETCC, MVT::v2f64, Custom); 808 setOperationAction(ISD::VSETCC, MVT::v16i8, Custom); 809 setOperationAction(ISD::VSETCC, MVT::v8i16, Custom); 810 setOperationAction(ISD::VSETCC, MVT::v4i32, Custom); 811 812 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 813 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 814 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 815 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 816 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 817 818 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom); 819 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom); 820 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); 821 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); 822 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); 823 824 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 825 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { 826 EVT VT = (MVT::SimpleValueType)i; 827 // Do not attempt to custom lower non-power-of-2 vectors 828 if (!isPowerOf2_32(VT.getVectorNumElements())) 829 continue; 830 // Do not attempt to custom lower non-128-bit vectors 831 if (!VT.is128BitVector()) 832 continue; 833 setOperationAction(ISD::BUILD_VECTOR, 834 VT.getSimpleVT().SimpleTy, Custom); 835 setOperationAction(ISD::VECTOR_SHUFFLE, 836 VT.getSimpleVT().SimpleTy, Custom); 837 setOperationAction(ISD::EXTRACT_VECTOR_ELT, 838 VT.getSimpleVT().SimpleTy, Custom); 839 } 840 841 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 842 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 843 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 844 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 845 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 846 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 847 848 if (Subtarget->is64Bit()) { 849 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 850 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 851 } 852 853 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
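    // Note that the [v16i8, v2i64) range of simple value types also includes
    // vector types that are not 128 bits wide; the is128BitVector() check in
    // the loop below skips those.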
854 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) { 855 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 856 EVT VT = SVT; 857 858 // Do not attempt to promote non-128-bit vectors 859 if (!VT.is128BitVector()) 860 continue; 861 862 setOperationAction(ISD::AND, SVT, Promote); 863 AddPromotedToType (ISD::AND, SVT, MVT::v2i64); 864 setOperationAction(ISD::OR, SVT, Promote); 865 AddPromotedToType (ISD::OR, SVT, MVT::v2i64); 866 setOperationAction(ISD::XOR, SVT, Promote); 867 AddPromotedToType (ISD::XOR, SVT, MVT::v2i64); 868 setOperationAction(ISD::LOAD, SVT, Promote); 869 AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64); 870 setOperationAction(ISD::SELECT, SVT, Promote); 871 AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64); 872 } 873 874 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 875 876 // Custom lower v2i64 and v2f64 selects. 877 setOperationAction(ISD::LOAD, MVT::v2f64, Legal); 878 setOperationAction(ISD::LOAD, MVT::v2i64, Legal); 879 setOperationAction(ISD::SELECT, MVT::v2f64, Custom); 880 setOperationAction(ISD::SELECT, MVT::v2i64, Custom); 881 882 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); 883 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); 884 } 885 886 if (Subtarget->hasSSE41() || Subtarget->hasAVX()) { 887 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 888 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 889 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 890 setOperationAction(ISD::FRINT, MVT::f32, Legal); 891 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 892 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 893 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 894 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 895 setOperationAction(ISD::FRINT, MVT::f64, Legal); 896 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 897 898 // FIXME: Do we need to handle scalar-to-vector here? 899 setOperationAction(ISD::MUL, MVT::v4i32, Legal); 900 901 // Can turn SHL into an integer multiply. 902 setOperationAction(ISD::SHL, MVT::v4i32, Custom); 903 setOperationAction(ISD::SHL, MVT::v16i8, Custom); 904 905 // i8 and i16 vectors are custom , because the source register and source 906 // source memory operand types are not the same width. f32 vectors are 907 // custom since the immediate controlling the insert encodes additional 908 // information. 
909 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 910 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 911 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 912 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 913 914 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); 915 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); 916 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 917 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 918 919 if (Subtarget->is64Bit()) { 920 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); 921 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); 922 } 923 } 924 925 if (Subtarget->hasSSE2() || Subtarget->hasAVX()) { 926 setOperationAction(ISD::SRL, MVT::v2i64, Custom); 927 setOperationAction(ISD::SRL, MVT::v4i32, Custom); 928 setOperationAction(ISD::SRL, MVT::v16i8, Custom); 929 setOperationAction(ISD::SRL, MVT::v8i16, Custom); 930 931 setOperationAction(ISD::SHL, MVT::v2i64, Custom); 932 setOperationAction(ISD::SHL, MVT::v4i32, Custom); 933 setOperationAction(ISD::SHL, MVT::v8i16, Custom); 934 935 setOperationAction(ISD::SRA, MVT::v4i32, Custom); 936 setOperationAction(ISD::SRA, MVT::v8i16, Custom); 937 } 938 939 if (Subtarget->hasSSE42() || Subtarget->hasAVX()) 940 setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); 941 942 if (!UseSoftFloat && Subtarget->hasAVX()) { 943 addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); 944 addRegisterClass(MVT::v16i16, X86::VR256RegisterClass); 945 addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); 946 addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); 947 addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); 948 addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); 949 950 setOperationAction(ISD::LOAD, MVT::v8f32, Legal); 951 setOperationAction(ISD::LOAD, MVT::v4f64, Legal); 952 setOperationAction(ISD::LOAD, MVT::v4i64, Legal); 953 954 setOperationAction(ISD::FADD, MVT::v8f32, Legal); 955 setOperationAction(ISD::FSUB, MVT::v8f32, Legal); 956 setOperationAction(ISD::FMUL, MVT::v8f32, Legal); 957 setOperationAction(ISD::FDIV, MVT::v8f32, Legal); 958 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); 959 setOperationAction(ISD::FNEG, MVT::v8f32, Custom); 960 961 setOperationAction(ISD::FADD, MVT::v4f64, Legal); 962 setOperationAction(ISD::FSUB, MVT::v4f64, Legal); 963 setOperationAction(ISD::FMUL, MVT::v4f64, Legal); 964 setOperationAction(ISD::FDIV, MVT::v4f64, Legal); 965 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); 966 setOperationAction(ISD::FNEG, MVT::v4f64, Custom); 967 968 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); 969 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); 970 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); 971 972 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f64, Custom); 973 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i64, Custom); 974 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); 975 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); 976 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom); 977 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom); 978 979 setOperationAction(ISD::SRL, MVT::v4i64, Custom); 980 setOperationAction(ISD::SRL, MVT::v8i32, Custom); 981 setOperationAction(ISD::SRL, MVT::v16i16, Custom); 982 setOperationAction(ISD::SRL, MVT::v32i8, Custom); 983 984 setOperationAction(ISD::SHL, MVT::v4i64, Custom); 985 setOperationAction(ISD::SHL, MVT::v8i32, Custom); 
986 setOperationAction(ISD::SHL, MVT::v16i16, Custom); 987 setOperationAction(ISD::SHL, MVT::v32i8, Custom); 988 989 setOperationAction(ISD::SRA, MVT::v8i32, Custom); 990 setOperationAction(ISD::SRA, MVT::v16i16, Custom); 991 992 setOperationAction(ISD::VSETCC, MVT::v8i32, Custom); 993 setOperationAction(ISD::VSETCC, MVT::v4i64, Custom); 994 995 setOperationAction(ISD::SELECT, MVT::v4f64, Custom); 996 setOperationAction(ISD::SELECT, MVT::v4i64, Custom); 997 setOperationAction(ISD::SELECT, MVT::v8f32, Custom); 998 999 // Custom lower several nodes for 256-bit types. 1000 for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 1001 i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { 1002 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 1003 EVT VT = SVT; 1004 1005 // Extract subvector is special because the value type 1006 // (result) is 128-bit but the source is 256-bit wide. 1007 if (VT.is128BitVector()) 1008 setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom); 1009 1010 // Do not attempt to custom lower other non-256-bit vectors 1011 if (!VT.is256BitVector()) 1012 continue; 1013 1014 setOperationAction(ISD::BUILD_VECTOR, SVT, Custom); 1015 setOperationAction(ISD::VECTOR_SHUFFLE, SVT, Custom); 1016 setOperationAction(ISD::INSERT_VECTOR_ELT, SVT, Custom); 1017 setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom); 1018 setOperationAction(ISD::SCALAR_TO_VECTOR, SVT, Custom); 1019 setOperationAction(ISD::INSERT_SUBVECTOR, SVT, Custom); 1020 } 1021 1022 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 1023 for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) { 1024 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 1025 EVT VT = SVT; 1026 1027 // Do not attempt to promote non-256-bit vectors 1028 if (!VT.is256BitVector()) 1029 continue; 1030 1031 setOperationAction(ISD::AND, SVT, Promote); 1032 AddPromotedToType (ISD::AND, SVT, MVT::v4i64); 1033 setOperationAction(ISD::OR, SVT, Promote); 1034 AddPromotedToType (ISD::OR, SVT, MVT::v4i64); 1035 setOperationAction(ISD::XOR, SVT, Promote); 1036 AddPromotedToType (ISD::XOR, SVT, MVT::v4i64); 1037 setOperationAction(ISD::LOAD, SVT, Promote); 1038 AddPromotedToType (ISD::LOAD, SVT, MVT::v4i64); 1039 setOperationAction(ISD::SELECT, SVT, Promote); 1040 AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64); 1041 } 1042 } 1043 1044 // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion 1045 // of this type with custom code. 1046 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 1047 VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) { 1048 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Custom); 1049 } 1050 1051 // We want to custom lower some of our intrinsics. 1052 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1053 1054 1055 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't 1056 // handle type legalization for these operations here. 1057 // 1058 // FIXME: We really should do custom legalization for addition and 1059 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better 1060 // than generic legalization for 64-bit multiplication-with-overflow, though. 1061 for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { 1062 // Add/Sub/Mul with overflow operations are custom lowered. 
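    // IntVTs holds { i8, i16, i32, i64 }; the loop bound of 3+is64Bit()
    // above leaves out i64 on 32-bit targets.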
1063 MVT VT = IntVTs[i]; 1064 setOperationAction(ISD::SADDO, VT, Custom); 1065 setOperationAction(ISD::UADDO, VT, Custom); 1066 setOperationAction(ISD::SSUBO, VT, Custom); 1067 setOperationAction(ISD::USUBO, VT, Custom); 1068 setOperationAction(ISD::SMULO, VT, Custom); 1069 setOperationAction(ISD::UMULO, VT, Custom); 1070 } 1071 1072 // There are no 8-bit 3-address imul/mul instructions 1073 setOperationAction(ISD::SMULO, MVT::i8, Expand); 1074 setOperationAction(ISD::UMULO, MVT::i8, Expand); 1075 1076 if (!Subtarget->is64Bit()) { 1077 // These libcalls are not available in 32-bit. 1078 setLibcallName(RTLIB::SHL_I128, 0); 1079 setLibcallName(RTLIB::SRL_I128, 0); 1080 setLibcallName(RTLIB::SRA_I128, 0); 1081 } 1082 1083 // We have target-specific dag combine patterns for the following nodes: 1084 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 1085 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 1086 setTargetDAGCombine(ISD::BUILD_VECTOR); 1087 setTargetDAGCombine(ISD::SELECT); 1088 setTargetDAGCombine(ISD::SHL); 1089 setTargetDAGCombine(ISD::SRA); 1090 setTargetDAGCombine(ISD::SRL); 1091 setTargetDAGCombine(ISD::OR); 1092 setTargetDAGCombine(ISD::AND); 1093 setTargetDAGCombine(ISD::ADD); 1094 setTargetDAGCombine(ISD::SUB); 1095 setTargetDAGCombine(ISD::STORE); 1096 setTargetDAGCombine(ISD::ZERO_EXTEND); 1097 setTargetDAGCombine(ISD::SINT_TO_FP); 1098 if (Subtarget->is64Bit()) 1099 setTargetDAGCombine(ISD::MUL); 1100 1101 computeRegisterProperties(); 1102 1103 // On Darwin, -Os means optimize for size without hurting performance, 1104 // do not reduce the limit. 1105 maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores 1106 maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; 1107 maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores 1108 maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; 1109 maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores 1110 maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; 1111 setPrefLoopAlignment(16); 1112 benefitFromCodePlacementOpt = true; 1113 1114 setPrefFunctionAlignment(4); 1115} 1116 1117 1118MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const { 1119 return MVT::i8; 1120} 1121 1122 1123/// getMaxByValAlign - Helper for getByValTypeAlignment to determine 1124/// the desired ByVal argument alignment. 1125static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { 1126 if (MaxAlign == 16) 1127 return; 1128 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { 1129 if (VTy->getBitWidth() == 128) 1130 MaxAlign = 16; 1131 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 1132 unsigned EltAlign = 0; 1133 getMaxByValAlign(ATy->getElementType(), EltAlign); 1134 if (EltAlign > MaxAlign) 1135 MaxAlign = EltAlign; 1136 } else if (StructType *STy = dyn_cast<StructType>(Ty)) { 1137 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { 1138 unsigned EltAlign = 0; 1139 getMaxByValAlign(STy->getElementType(i), EltAlign); 1140 if (EltAlign > MaxAlign) 1141 MaxAlign = EltAlign; 1142 if (MaxAlign == 16) 1143 break; 1144 } 1145 } 1146 return; 1147} 1148 1149/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate 1150/// function arguments in the caller parameter area. For X86, aggregates 1151/// that contain SSE vectors are placed at 16-byte boundaries while the rest 1152/// are at 4-byte boundaries. 1153unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { 1154 if (Subtarget->is64Bit()) { 1155 // Max of 8 and alignment of type. 
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasXMM())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination can be assumed to satisfy
/// any alignment constraint. Similarly, if SrcAlign is zero there is no need
/// to check it against an alignment requirement, probably because the source
/// does not need to be loaded. If 'NonScalarIntSafe' is true, it is safe to
/// return a non-scalar-integer type, e.g. empty string source, constant, or
/// loaded from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
/// constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  const Function *F = MF.getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasXMMInt()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
1243SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 1244 SelectionDAG &DAG) const { 1245 if (!Subtarget->is64Bit()) 1246 // This doesn't have DebugLoc associated with it, but is not really the 1247 // same as a Register. 1248 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()); 1249 return Table; 1250} 1251 1252/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the 1253/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an 1254/// MCExpr. 1255const MCExpr *X86TargetLowering:: 1256getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 1257 MCContext &Ctx) const { 1258 // X86-64 uses RIP relative addressing based on the jump table label. 1259 if (Subtarget->isPICStyleRIPRel()) 1260 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 1261 1262 // Otherwise, the reference is relative to the PIC base. 1263 return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); 1264} 1265 1266// FIXME: Why this routine is here? Move to RegInfo! 1267std::pair<const TargetRegisterClass*, uint8_t> 1268X86TargetLowering::findRepresentativeClass(EVT VT) const{ 1269 const TargetRegisterClass *RRC = 0; 1270 uint8_t Cost = 1; 1271 switch (VT.getSimpleVT().SimpleTy) { 1272 default: 1273 return TargetLowering::findRepresentativeClass(VT); 1274 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 1275 RRC = (Subtarget->is64Bit() 1276 ? X86::GR64RegisterClass : X86::GR32RegisterClass); 1277 break; 1278 case MVT::x86mmx: 1279 RRC = X86::VR64RegisterClass; 1280 break; 1281 case MVT::f32: case MVT::f64: 1282 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1283 case MVT::v4f32: case MVT::v2f64: 1284 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1285 case MVT::v4f64: 1286 RRC = X86::VR128RegisterClass; 1287 break; 1288 } 1289 return std::make_pair(RRC, Cost); 1290} 1291 1292bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1293 unsigned &Offset) const { 1294 if (!Subtarget->isTargetLinux()) 1295 return false; 1296 1297 if (Subtarget->is64Bit()) { 1298 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1299 Offset = 0x28; 1300 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1301 AddressSpace = 256; 1302 else 1303 AddressSpace = 257; 1304 } else { 1305 // %gs:0x14 on i386 1306 Offset = 0x14; 1307 AddressSpace = 256; 1308 } 1309 return true; 1310} 1311 1312 1313//===----------------------------------------------------------------------===// 1314// Return Value Calling Convention Implementation 1315//===----------------------------------------------------------------------===// 1316 1317#include "X86GenCallingConv.inc" 1318 1319bool 1320X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, 1321 MachineFunction &MF, bool isVarArg, 1322 const SmallVectorImpl<ISD::OutputArg> &Outs, 1323 LLVMContext &Context) const { 1324 SmallVector<CCValAssign, 16> RVLocs; 1325 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1326 RVLocs, Context); 1327 return CCInfo.CheckReturn(Outs, RetCC_X86); 1328} 1329 1330SDValue 1331X86TargetLowering::LowerReturn(SDValue Chain, 1332 CallingConv::ID CallConv, bool isVarArg, 1333 const SmallVectorImpl<ISD::OutputArg> &Outs, 1334 const SmallVectorImpl<SDValue> &OutVals, 1335 DebugLoc dl, SelectionDAG &DAG) const { 1336 MachineFunction &MF = DAG.getMachineFunction(); 1337 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1338 1339 SmallVector<CCValAssign, 16> 
RVLocs; 1340 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1341 RVLocs, *DAG.getContext()); 1342 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1343 1344 // Add the regs to the liveout set for the function. 1345 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1346 for (unsigned i = 0; i != RVLocs.size(); ++i) 1347 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1348 MRI.addLiveOut(RVLocs[i].getLocReg()); 1349 1350 SDValue Flag; 1351 1352 SmallVector<SDValue, 6> RetOps; 1353 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1354 // Operand #1 = Bytes To Pop 1355 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1356 MVT::i16)); 1357 1358 // Copy the result values into the output registers. 1359 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1360 CCValAssign &VA = RVLocs[i]; 1361 assert(VA.isRegLoc() && "Can only return in registers!"); 1362 SDValue ValToCopy = OutVals[i]; 1363 EVT ValVT = ValToCopy.getValueType(); 1364 1365 // If this is x86-64, and we disabled SSE, we can't return FP values, 1366 // or SSE or MMX vectors. 1367 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 1368 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 1369 (Subtarget->is64Bit() && !Subtarget->hasXMM())) { 1370 report_fatal_error("SSE register return with SSE disabled"); 1371 } 1372 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 1373 // llvm-gcc has never done it right and no one has noticed, so this 1374 // should be OK for now. 1375 if (ValVT == MVT::f64 && 1376 (Subtarget->is64Bit() && !Subtarget->hasXMMInt())) 1377 report_fatal_error("SSE2 register return with SSE2 disabled"); 1378 1379 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1380 // the RET instruction and handled by the FP Stackifier. 1381 if (VA.getLocReg() == X86::ST0 || 1382 VA.getLocReg() == X86::ST1) { 1383 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1384 // change the value to the FP stack register class. 1385 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1386 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1387 RetOps.push_back(ValToCopy); 1388 // Don't emit a copytoreg. 1389 continue; 1390 } 1391 1392 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1393 // which is returned in RAX / RDX. 1394 if (Subtarget->is64Bit()) { 1395 if (ValVT == MVT::x86mmx) { 1396 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1397 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 1398 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1399 ValToCopy); 1400 // If we don't have SSE2 available, convert to v4f32 so the generated 1401 // register is legal. 1402 if (!Subtarget->hasSSE2()) 1403 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 1404 } 1405 } 1406 } 1407 1408 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1409 Flag = Chain.getValue(1); 1410 } 1411 1412 // The x86-64 ABI for returning structs by value requires that we copy 1413 // the sret argument into %rax for the return. We saved the argument into 1414 // a virtual register in the entry block, so now we copy the value out 1415 // and into %rax. 
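  // (The System V x86-64 ABI requires the callee to hand the sret pointer back
  // in %rax, so callers are entitled to read %rax after the call even though
  // the value was originally passed to us as an argument.)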
1416 if (Subtarget->is64Bit() && 1417 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1418 MachineFunction &MF = DAG.getMachineFunction(); 1419 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1420 unsigned Reg = FuncInfo->getSRetReturnReg(); 1421 assert(Reg && 1422 "SRetReturnReg should have been set in LowerFormalArguments()."); 1423 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1424 1425 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1426 Flag = Chain.getValue(1); 1427 1428 // RAX now acts like a return value. 1429 MRI.addLiveOut(X86::RAX); 1430 } 1431 1432 RetOps[0] = Chain; // Update chain. 1433 1434 // Add the flag if we have it. 1435 if (Flag.getNode()) 1436 RetOps.push_back(Flag); 1437 1438 return DAG.getNode(X86ISD::RET_FLAG, dl, 1439 MVT::Other, &RetOps[0], RetOps.size()); 1440} 1441 1442bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { 1443 if (N->getNumValues() != 1) 1444 return false; 1445 if (!N->hasNUsesOfValue(1, 0)) 1446 return false; 1447 1448 SDNode *Copy = *N->use_begin(); 1449 if (Copy->getOpcode() != ISD::CopyToReg && 1450 Copy->getOpcode() != ISD::FP_EXTEND) 1451 return false; 1452 1453 bool HasRet = false; 1454 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 1455 UI != UE; ++UI) { 1456 if (UI->getOpcode() != X86ISD::RET_FLAG) 1457 return false; 1458 HasRet = true; 1459 } 1460 1461 return HasRet; 1462} 1463 1464EVT 1465X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, 1466 ISD::NodeType ExtendKind) const { 1467 MVT ReturnMVT; 1468 // TODO: Is this also valid on 32-bit? 1469 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) 1470 ReturnMVT = MVT::i8; 1471 else 1472 ReturnMVT = MVT::i32; 1473 1474 EVT MinVT = getRegisterType(Context, ReturnMVT); 1475 return VT.bitsLT(MinVT) ? MinVT : VT; 1476} 1477 1478/// LowerCallResult - Lower the result values of a call into the 1479/// appropriate copies out of appropriate physical registers. 1480/// 1481SDValue 1482X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1483 CallingConv::ID CallConv, bool isVarArg, 1484 const SmallVectorImpl<ISD::InputArg> &Ins, 1485 DebugLoc dl, SelectionDAG &DAG, 1486 SmallVectorImpl<SDValue> &InVals) const { 1487 1488 // Assign locations to each value returned by this call. 1489 SmallVector<CCValAssign, 16> RVLocs; 1490 bool Is64Bit = Subtarget->is64Bit(); 1491 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 1492 getTargetMachine(), RVLocs, *DAG.getContext()); 1493 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1494 1495 // Copy all of the result registers out of their specified physreg. 1496 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1497 CCValAssign &VA = RVLocs[i]; 1498 EVT CopyVT = VA.getValVT(); 1499 1500 // If this is x86-64, and we disabled SSE, we can't return FP values 1501 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1502 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) { 1503 report_fatal_error("SSE register return with SSE disabled"); 1504 } 1505 1506 SDValue Val; 1507 1508 // If this is a call to a function that returns an fp value on the floating 1509 // point stack, we must guarantee the the value is popped from the stack, so 1510 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1511 // if the return value is not used. We use the FpPOP_RETVAL instruction 1512 // instead. 
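    // FpPOP_RETVAL both reads and pops ST(0), so the x87 register stack stays
    // balanced even when the call result ends up unused. If the value is wanted
    // in an SSE register, it is copied out as f80 and rounded back down below.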
1513 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1514 // If we prefer to use the value in xmm registers, copy it out as f80 and 1515 // use a truncate to move it from fp stack reg to xmm reg. 1516 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 1517 SDValue Ops[] = { Chain, InFlag }; 1518 Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT, 1519 MVT::Other, MVT::Glue, Ops, 2), 1); 1520 Val = Chain.getValue(0); 1521 1522 // Round the f80 to the right size, which also moves it to the appropriate 1523 // xmm register. 1524 if (CopyVT != VA.getValVT()) 1525 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1526 // This truncation won't change the value. 1527 DAG.getIntPtrConstant(1)); 1528 } else { 1529 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1530 CopyVT, InFlag).getValue(1); 1531 Val = Chain.getValue(0); 1532 } 1533 InFlag = Chain.getValue(2); 1534 InVals.push_back(Val); 1535 } 1536 1537 return Chain; 1538} 1539 1540 1541//===----------------------------------------------------------------------===// 1542// C & StdCall & Fast Calling Convention implementation 1543//===----------------------------------------------------------------------===// 1544// StdCall calling convention seems to be standard for many Windows' API 1545// routines and around. It differs from C calling convention just a little: 1546// callee should clean up the stack, not caller. Symbols should be also 1547// decorated in some fancy way :) It doesn't support any vector arguments. 1548// For info on fast calling convention see Fast Calling Convention (tail call) 1549// implementation LowerX86_32FastCCCallTo. 1550 1551/// CallIsStructReturn - Determines whether a call uses struct return 1552/// semantics. 1553static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1554 if (Outs.empty()) 1555 return false; 1556 1557 return Outs[0].Flags.isSRet(); 1558} 1559 1560/// ArgsAreStructReturn - Determines whether a function uses struct 1561/// return semantics. 1562static bool 1563ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1564 if (Ins.empty()) 1565 return false; 1566 1567 return Ins[0].Flags.isSRet(); 1568} 1569 1570/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1571/// by "Src" to address "Dst" with size and alignment information specified by 1572/// the specific parameter attribute. The copy will be passed as a byval 1573/// function parameter. 1574static SDValue 1575CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1576 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1577 DebugLoc dl) { 1578 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1579 1580 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1581 /*isVolatile*/false, /*AlwaysInline=*/true, 1582 MachinePointerInfo(), MachinePointerInfo()); 1583} 1584 1585/// IsTailCallConvention - Return true if the calling convention is one that 1586/// supports tail call optimization. 
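/// Only the 'fast' and GHC conventions opt in to the ABI-changing form of tail
/// calls enabled by -tailcallopt; calls using the C convention can still be
/// emitted as sibcalls when no ABI change is required.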
1587static bool IsTailCallConvention(CallingConv::ID CC) { 1588 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1589} 1590 1591bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 1592 if (!CI->isTailCall()) 1593 return false; 1594 1595 CallSite CS(CI); 1596 CallingConv::ID CalleeCC = CS.getCallingConv(); 1597 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) 1598 return false; 1599 1600 return true; 1601} 1602 1603/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1604/// a tailcall target by changing its ABI. 1605static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1606 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1607} 1608 1609SDValue 1610X86TargetLowering::LowerMemArgument(SDValue Chain, 1611 CallingConv::ID CallConv, 1612 const SmallVectorImpl<ISD::InputArg> &Ins, 1613 DebugLoc dl, SelectionDAG &DAG, 1614 const CCValAssign &VA, 1615 MachineFrameInfo *MFI, 1616 unsigned i) const { 1617 // Create the nodes corresponding to a load from this parameter slot. 1618 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1619 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1620 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1621 EVT ValVT; 1622 1623 // If value is passed by pointer we have address passed instead of the value 1624 // itself. 1625 if (VA.getLocInfo() == CCValAssign::Indirect) 1626 ValVT = VA.getLocVT(); 1627 else 1628 ValVT = VA.getValVT(); 1629 1630 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1631 // changed with more analysis. 1632 // In case of tail call optimization mark all arguments mutable. Since they 1633 // could be overwritten by lowering of arguments in case of a tail call. 1634 if (Flags.isByVal()) { 1635 unsigned Bytes = Flags.getByValSize(); 1636 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. 1637 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); 1638 return DAG.getFrameIndex(FI, getPointerTy()); 1639 } else { 1640 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1641 VA.getLocMemOffset(), isImmutable); 1642 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1643 return DAG.getLoad(ValVT, dl, Chain, FIN, 1644 MachinePointerInfo::getFixedStack(FI), 1645 false, false, 0); 1646 } 1647} 1648 1649SDValue 1650X86TargetLowering::LowerFormalArguments(SDValue Chain, 1651 CallingConv::ID CallConv, 1652 bool isVarArg, 1653 const SmallVectorImpl<ISD::InputArg> &Ins, 1654 DebugLoc dl, 1655 SelectionDAG &DAG, 1656 SmallVectorImpl<SDValue> &InVals) 1657 const { 1658 MachineFunction &MF = DAG.getMachineFunction(); 1659 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1660 1661 const Function* Fn = MF.getFunction(); 1662 if (Fn->hasExternalLinkage() && 1663 Subtarget->isTargetCygMing() && 1664 Fn->getName() == "main") 1665 FuncInfo->setForceFramePointer(true); 1666 1667 MachineFrameInfo *MFI = MF.getFrameInfo(); 1668 bool Is64Bit = Subtarget->is64Bit(); 1669 bool IsWin64 = Subtarget->isTargetWin64(); 1670 1671 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1672 "Var args not supported with calling convention fastcc or ghc"); 1673 1674 // Assign locations to all of the incoming arguments. 
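  // CCState runs the TableGen-generated CC_X86 rules (X86GenCallingConv.inc)
  // over the formal arguments and records, for each one, whether it arrives in
  // a register or at a fixed stack offset.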
1675 SmallVector<CCValAssign, 16> ArgLocs; 1676 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1677 ArgLocs, *DAG.getContext()); 1678 1679 // Allocate shadow area for Win64 1680 if (IsWin64) { 1681 CCInfo.AllocateStack(32, 8); 1682 } 1683 1684 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 1685 1686 unsigned LastVal = ~0U; 1687 SDValue ArgValue; 1688 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1689 CCValAssign &VA = ArgLocs[i]; 1690 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1691 // places. 1692 assert(VA.getValNo() != LastVal && 1693 "Don't support value assigned to multiple locs yet"); 1694 LastVal = VA.getValNo(); 1695 1696 if (VA.isRegLoc()) { 1697 EVT RegVT = VA.getLocVT(); 1698 TargetRegisterClass *RC = NULL; 1699 if (RegVT == MVT::i32) 1700 RC = X86::GR32RegisterClass; 1701 else if (Is64Bit && RegVT == MVT::i64) 1702 RC = X86::GR64RegisterClass; 1703 else if (RegVT == MVT::f32) 1704 RC = X86::FR32RegisterClass; 1705 else if (RegVT == MVT::f64) 1706 RC = X86::FR64RegisterClass; 1707 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1708 RC = X86::VR256RegisterClass; 1709 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1710 RC = X86::VR128RegisterClass; 1711 else if (RegVT == MVT::x86mmx) 1712 RC = X86::VR64RegisterClass; 1713 else 1714 llvm_unreachable("Unknown argument type!"); 1715 1716 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1717 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1718 1719 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1720 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1721 // right size. 1722 if (VA.getLocInfo() == CCValAssign::SExt) 1723 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1724 DAG.getValueType(VA.getValVT())); 1725 else if (VA.getLocInfo() == CCValAssign::ZExt) 1726 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1727 DAG.getValueType(VA.getValVT())); 1728 else if (VA.getLocInfo() == CCValAssign::BCvt) 1729 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 1730 1731 if (VA.isExtInLoc()) { 1732 // Handle MMX values passed in XMM regs. 1733 if (RegVT.isVector()) { 1734 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), 1735 ArgValue); 1736 } else 1737 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1738 } 1739 } else { 1740 assert(VA.isMemLoc()); 1741 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1742 } 1743 1744 // If value is passed via pointer - do a load. 1745 if (VA.getLocInfo() == CCValAssign::Indirect) 1746 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 1747 MachinePointerInfo(), false, false, 0); 1748 1749 InVals.push_back(ArgValue); 1750 } 1751 1752 // The x86-64 ABI for returning structs by value requires that we copy 1753 // the sret argument into %rax for the return. Save the argument into 1754 // a virtual register so that we can access it from the return points. 
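  // LowerReturn copies this virtual register back into %rax at every return
  // point, so the pointer remains available even after the incoming argument
  // register has been clobbered.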
1755 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1756 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1757 unsigned Reg = FuncInfo->getSRetReturnReg(); 1758 if (!Reg) { 1759 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1760 FuncInfo->setSRetReturnReg(Reg); 1761 } 1762 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1763 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1764 } 1765 1766 unsigned StackSize = CCInfo.getNextStackOffset(); 1767 // Align stack specially for tail calls. 1768 if (FuncIsMadeTailCallSafe(CallConv)) 1769 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1770 1771 // If the function takes variable number of arguments, make a frame index for 1772 // the start of the first vararg value... for expansion of llvm.va_start. 1773 if (isVarArg) { 1774 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1775 CallConv != CallingConv::X86_ThisCall)) { 1776 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1777 } 1778 if (Is64Bit) { 1779 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1780 1781 // FIXME: We should really autogenerate these arrays 1782 static const unsigned GPR64ArgRegsWin64[] = { 1783 X86::RCX, X86::RDX, X86::R8, X86::R9 1784 }; 1785 static const unsigned GPR64ArgRegs64Bit[] = { 1786 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1787 }; 1788 static const unsigned XMMArgRegs64Bit[] = { 1789 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1790 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1791 }; 1792 const unsigned *GPR64ArgRegs; 1793 unsigned NumXMMRegs = 0; 1794 1795 if (IsWin64) { 1796 // The XMM registers which might contain var arg parameters are shadowed 1797 // in their paired GPR. So we only need to save the GPR to their home 1798 // slots. 1799 TotalNumIntRegs = 4; 1800 GPR64ArgRegs = GPR64ArgRegsWin64; 1801 } else { 1802 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1803 GPR64ArgRegs = GPR64ArgRegs64Bit; 1804 1805 NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs); 1806 } 1807 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1808 TotalNumIntRegs); 1809 1810 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1811 assert(!(NumXMMRegs && !Subtarget->hasXMM()) && 1812 "SSE register cannot be used when SSE is disabled!"); 1813 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1814 "SSE register cannot be used when SSE is disabled!"); 1815 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM()) 1816 // Kernel mode asks for SSE to be disabled, so don't push them 1817 // on the stack. 1818 TotalNumXMMRegs = 0; 1819 1820 if (IsWin64) { 1821 const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); 1822 // Get to the caller-allocated home save location. Add 8 to account 1823 // for the return address. 1824 int HomeOffset = TFI.getOffsetOfLocalArea() + 8; 1825 FuncInfo->setRegSaveFrameIndex( 1826 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 1827 // Fixup to set vararg frame on shadow area (4 x i64). 1828 if (NumIntRegs < 4) 1829 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 1830 } else { 1831 // For X86-64, if there are vararg parameters that are passed via 1832 // registers, then we must store them to their spots on the stack so they 1833 // may be loaded by deferencing the result of va_next. 
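        // The register save area holds the six integer argument registers
        // (8 bytes each) followed, when SSE is available, by the eight XMM
        // argument registers (16 bytes each). The offsets recorded here become
        // the va_list gp_offset/fp_offset fields; e.g. a function with two
        // named integer arguments starts its unnamed arguments at gp_offset 16.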
1834 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1835 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1836 FuncInfo->setRegSaveFrameIndex( 1837 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1838 false)); 1839 } 1840 1841 // Store the integer parameter registers. 1842 SmallVector<SDValue, 8> MemOps; 1843 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1844 getPointerTy()); 1845 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1846 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1847 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1848 DAG.getIntPtrConstant(Offset)); 1849 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1850 X86::GR64RegisterClass); 1851 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1852 SDValue Store = 1853 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1854 MachinePointerInfo::getFixedStack( 1855 FuncInfo->getRegSaveFrameIndex(), Offset), 1856 false, false, 0); 1857 MemOps.push_back(Store); 1858 Offset += 8; 1859 } 1860 1861 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1862 // Now store the XMM (fp + vector) parameter registers. 1863 SmallVector<SDValue, 11> SaveXMMOps; 1864 SaveXMMOps.push_back(Chain); 1865 1866 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1867 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1868 SaveXMMOps.push_back(ALVal); 1869 1870 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1871 FuncInfo->getRegSaveFrameIndex())); 1872 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1873 FuncInfo->getVarArgsFPOffset())); 1874 1875 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1876 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 1877 X86::VR128RegisterClass); 1878 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1879 SaveXMMOps.push_back(Val); 1880 } 1881 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1882 MVT::Other, 1883 &SaveXMMOps[0], SaveXMMOps.size())); 1884 } 1885 1886 if (!MemOps.empty()) 1887 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1888 &MemOps[0], MemOps.size()); 1889 } 1890 } 1891 1892 // Some CCs need callee pop. 1893 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) { 1894 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1895 } else { 1896 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1897 // If this is an sret function, the return should pop the hidden pointer. 1898 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1899 FuncInfo->setBytesToPopOnReturn(4); 1900 } 1901 1902 if (!Is64Bit) { 1903 // RegSaveFrameIndex is X86-64 only. 1904 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1905 if (CallConv == CallingConv::X86_FastCall || 1906 CallConv == CallingConv::X86_ThisCall) 1907 // fastcc functions can't have varargs. 
1908 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1909 } 1910 1911 return Chain; 1912} 1913 1914SDValue 1915X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1916 SDValue StackPtr, SDValue Arg, 1917 DebugLoc dl, SelectionDAG &DAG, 1918 const CCValAssign &VA, 1919 ISD::ArgFlagsTy Flags) const { 1920 unsigned LocMemOffset = VA.getLocMemOffset(); 1921 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1922 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1923 if (Flags.isByVal()) 1924 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1925 1926 return DAG.getStore(Chain, dl, Arg, PtrOff, 1927 MachinePointerInfo::getStack(LocMemOffset), 1928 false, false, 0); 1929} 1930 1931/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1932/// optimization is performed and it is required. 1933SDValue 1934X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1935 SDValue &OutRetAddr, SDValue Chain, 1936 bool IsTailCall, bool Is64Bit, 1937 int FPDiff, DebugLoc dl) const { 1938 // Adjust the Return address stack slot. 1939 EVT VT = getPointerTy(); 1940 OutRetAddr = getReturnAddressFrameIndex(DAG); 1941 1942 // Load the "old" Return address. 1943 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 1944 false, false, 0); 1945 return SDValue(OutRetAddr.getNode(), 1); 1946} 1947 1948/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call 1949/// optimization is performed and it is required (FPDiff!=0). 1950static SDValue 1951EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1952 SDValue Chain, SDValue RetAddrFrIdx, 1953 bool Is64Bit, int FPDiff, DebugLoc dl) { 1954 // Store the return address to the appropriate stack slot. 1955 if (!FPDiff) return Chain; 1956 // Calculate the new stack slot for the return address. 1957 int SlotSize = Is64Bit ? 8 : 4; 1958 int NewReturnAddrFI = 1959 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 1960 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1961 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1962 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1963 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 1964 false, false, 0); 1965 return Chain; 1966} 1967 1968SDValue 1969X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1970 CallingConv::ID CallConv, bool isVarArg, 1971 bool &isTailCall, 1972 const SmallVectorImpl<ISD::OutputArg> &Outs, 1973 const SmallVectorImpl<SDValue> &OutVals, 1974 const SmallVectorImpl<ISD::InputArg> &Ins, 1975 DebugLoc dl, SelectionDAG &DAG, 1976 SmallVectorImpl<SDValue> &InVals) const { 1977 MachineFunction &MF = DAG.getMachineFunction(); 1978 bool Is64Bit = Subtarget->is64Bit(); 1979 bool IsWin64 = Subtarget->isTargetWin64(); 1980 bool IsStructRet = CallIsStructReturn(Outs); 1981 bool IsSibcall = false; 1982 1983 if (isTailCall) { 1984 // Check if it's really possible to do a tail call. 1985 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1986 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1987 Outs, OutVals, Ins, DAG); 1988 1989 // Sibcalls are automatically detected tailcalls which do not require 1990 // ABI changes. 
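    // A sibcall reuses the caller's existing argument area, so no stack
    // adjustment (CALLSEQ_START/CALLSEQ_END) and no argument stores are emitted
    // for it below.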
1991 if (!GuaranteedTailCallOpt && isTailCall) 1992 IsSibcall = true; 1993 1994 if (isTailCall) 1995 ++NumTailCalls; 1996 } 1997 1998 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1999 "Var args not supported with calling convention fastcc or ghc"); 2000 2001 // Analyze operands of the call, assigning locations to each operand. 2002 SmallVector<CCValAssign, 16> ArgLocs; 2003 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 2004 ArgLocs, *DAG.getContext()); 2005 2006 // Allocate shadow area for Win64 2007 if (IsWin64) { 2008 CCInfo.AllocateStack(32, 8); 2009 } 2010 2011 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2012 2013 // Get a count of how many bytes are to be pushed on the stack. 2014 unsigned NumBytes = CCInfo.getNextStackOffset(); 2015 if (IsSibcall) 2016 // This is a sibcall. The memory operands are available in caller's 2017 // own caller's stack. 2018 NumBytes = 0; 2019 else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) 2020 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 2021 2022 int FPDiff = 0; 2023 if (isTailCall && !IsSibcall) { 2024 // Lower arguments at fp - stackoffset + fpdiff. 2025 unsigned NumBytesCallerPushed = 2026 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 2027 FPDiff = NumBytesCallerPushed - NumBytes; 2028 2029 // Set the delta of movement of the returnaddr stackslot. 2030 // But only set if delta is greater than previous delta. 2031 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 2032 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 2033 } 2034 2035 if (!IsSibcall) 2036 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 2037 2038 SDValue RetAddrFrIdx; 2039 // Load return address for tail calls. 2040 if (isTailCall && FPDiff) 2041 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 2042 Is64Bit, FPDiff, dl); 2043 2044 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 2045 SmallVector<SDValue, 8> MemOpChains; 2046 SDValue StackPtr; 2047 2048 // Walk the register/memloc assignments, inserting copies/loads. In the case 2049 // of tail call optimization arguments are handle later. 2050 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2051 CCValAssign &VA = ArgLocs[i]; 2052 EVT RegVT = VA.getLocVT(); 2053 SDValue Arg = OutVals[i]; 2054 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2055 bool isByVal = Flags.isByVal(); 2056 2057 // Promote the value if needed. 2058 switch (VA.getLocInfo()) { 2059 default: llvm_unreachable("Unknown loc info!"); 2060 case CCValAssign::Full: break; 2061 case CCValAssign::SExt: 2062 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 2063 break; 2064 case CCValAssign::ZExt: 2065 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 2066 break; 2067 case CCValAssign::AExt: 2068 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 2069 // Special case: passing MMX values in XMM registers. 2070 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); 2071 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 2072 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 2073 } else 2074 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 2075 break; 2076 case CCValAssign::BCvt: 2077 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg); 2078 break; 2079 case CCValAssign::Indirect: { 2080 // Store the argument. 
2081 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 2082 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 2083 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 2084 MachinePointerInfo::getFixedStack(FI), 2085 false, false, 0); 2086 Arg = SpillSlot; 2087 break; 2088 } 2089 } 2090 2091 if (VA.isRegLoc()) { 2092 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2093 if (isVarArg && IsWin64) { 2094 // Win64 ABI requires argument XMM reg to be copied to the corresponding 2095 // shadow reg if callee is a varargs function. 2096 unsigned ShadowReg = 0; 2097 switch (VA.getLocReg()) { 2098 case X86::XMM0: ShadowReg = X86::RCX; break; 2099 case X86::XMM1: ShadowReg = X86::RDX; break; 2100 case X86::XMM2: ShadowReg = X86::R8; break; 2101 case X86::XMM3: ShadowReg = X86::R9; break; 2102 } 2103 if (ShadowReg) 2104 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 2105 } 2106 } else if (!IsSibcall && (!isTailCall || isByVal)) { 2107 assert(VA.isMemLoc()); 2108 if (StackPtr.getNode() == 0) 2109 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 2110 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2111 dl, DAG, VA, Flags)); 2112 } 2113 } 2114 2115 if (!MemOpChains.empty()) 2116 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2117 &MemOpChains[0], MemOpChains.size()); 2118 2119 // Build a sequence of copy-to-reg nodes chained together with token chain 2120 // and flag operands which copy the outgoing args into registers. 2121 SDValue InFlag; 2122 // Tail call byval lowering might overwrite argument registers so in case of 2123 // tail call optimization the copies to registers are lowered later. 2124 if (!isTailCall) 2125 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2126 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2127 RegsToPass[i].second, InFlag); 2128 InFlag = Chain.getValue(1); 2129 } 2130 2131 if (Subtarget->isPICStyleGOT()) { 2132 // ELF / PIC requires GOT in the EBX register before function calls via PLT 2133 // GOT pointer. 2134 if (!isTailCall) { 2135 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 2136 DAG.getNode(X86ISD::GlobalBaseReg, 2137 DebugLoc(), getPointerTy()), 2138 InFlag); 2139 InFlag = Chain.getValue(1); 2140 } else { 2141 // If we are tail calling and generating PIC/GOT style code load the 2142 // address of the callee into ECX. The value in ecx is used as target of 2143 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2144 // for tail calls on PIC/GOT architectures. Normally we would just put the 2145 // address of GOT into ebx and then call target@PLT. But for tail calls 2146 // ebx would be restored (since ebx is callee saved) before jumping to the 2147 // target@PLT. 2148 2149 // Note: The actual moving to ECX is done further down. 2150 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2151 if (G && !G->getGlobal()->hasHiddenVisibility() && 2152 !G->getGlobal()->hasProtectedVisibility()) 2153 Callee = LowerGlobalAddress(Callee, DAG); 2154 else if (isa<ExternalSymbolSDNode>(Callee)) 2155 Callee = LowerExternalSymbol(Callee, DAG); 2156 } 2157 } 2158 2159 if (Is64Bit && isVarArg && !IsWin64) { 2160 // From AMD64 ABI document: 2161 // For calls that may call functions that use varargs or stdargs 2162 // (prototype-less calls or calls to functions containing ellipsis (...) in 2163 // the declaration) %al is used as hidden argument to specify the number 2164 // of SSE registers used. 
The contents of %al do not need to match exactly 2165 // the number of registers, but must be an ubound on the number of SSE 2166 // registers used and is in the range 0 - 8 inclusive. 2167 2168 // Count the number of XMM registers allocated. 2169 static const unsigned XMMArgRegs[] = { 2170 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2171 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2172 }; 2173 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 2174 assert((Subtarget->hasXMM() || !NumXMMRegs) 2175 && "SSE registers cannot be used when SSE is disabled"); 2176 2177 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 2178 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 2179 InFlag = Chain.getValue(1); 2180 } 2181 2182 2183 // For tail calls lower the arguments to the 'real' stack slot. 2184 if (isTailCall) { 2185 // Force all the incoming stack arguments to be loaded from the stack 2186 // before any new outgoing arguments are stored to the stack, because the 2187 // outgoing stack slots may alias the incoming argument stack slots, and 2188 // the alias isn't otherwise explicit. This is slightly more conservative 2189 // than necessary, because it means that each store effectively depends 2190 // on every argument instead of just those arguments it would clobber. 2191 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2192 2193 SmallVector<SDValue, 8> MemOpChains2; 2194 SDValue FIN; 2195 int FI = 0; 2196 // Do not flag preceding copytoreg stuff together with the following stuff. 2197 InFlag = SDValue(); 2198 if (GuaranteedTailCallOpt) { 2199 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2200 CCValAssign &VA = ArgLocs[i]; 2201 if (VA.isRegLoc()) 2202 continue; 2203 assert(VA.isMemLoc()); 2204 SDValue Arg = OutVals[i]; 2205 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2206 // Create frame index. 2207 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2208 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2209 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 2210 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2211 2212 if (Flags.isByVal()) { 2213 // Copy relative to framepointer. 2214 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2215 if (StackPtr.getNode() == 0) 2216 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 2217 getPointerTy()); 2218 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2219 2220 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2221 ArgChain, 2222 Flags, DAG, dl)); 2223 } else { 2224 // Store relative to framepointer. 2225 MemOpChains2.push_back( 2226 DAG.getStore(ArgChain, dl, Arg, FIN, 2227 MachinePointerInfo::getFixedStack(FI), 2228 false, false, 0)); 2229 } 2230 } 2231 } 2232 2233 if (!MemOpChains2.empty()) 2234 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2235 &MemOpChains2[0], MemOpChains2.size()); 2236 2237 // Copy arguments to their registers. 2238 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2239 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2240 RegsToPass[i].second, InFlag); 2241 InFlag = Chain.getValue(1); 2242 } 2243 InFlag =SDValue(); 2244 2245 // Store the return address to the appropriate stack slot. 
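    // A non-zero FPDiff means the callee's argument area differs in size from
    // the caller's, so the return address has to be relocated to stay adjacent
    // to the outgoing arguments (see the stack layout sketch further below).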
2246 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 2247 FPDiff, dl); 2248 } 2249 2250 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2251 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2252 // In the 64-bit large code model, we have to make all calls 2253 // through a register, since the call instruction's 32-bit 2254 // pc-relative offset may not be large enough to hold the whole 2255 // address. 2256 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2257 // If the callee is a GlobalAddress node (quite common, every direct call 2258 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 2259 // it. 2260 2261 // We should use extra load for direct calls to dllimported functions in 2262 // non-JIT mode. 2263 const GlobalValue *GV = G->getGlobal(); 2264 if (!GV->hasDLLImportLinkage()) { 2265 unsigned char OpFlags = 0; 2266 bool ExtraLoad = false; 2267 unsigned WrapperKind = ISD::DELETED_NODE; 2268 2269 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2270 // external symbols most go through the PLT in PIC mode. If the symbol 2271 // has hidden or protected visibility, or if it is static or local, then 2272 // we don't need to use the PLT - we can directly call it. 2273 if (Subtarget->isTargetELF() && 2274 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2275 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2276 OpFlags = X86II::MO_PLT; 2277 } else if (Subtarget->isPICStyleStubAny() && 2278 (GV->isDeclaration() || GV->isWeakForLinker()) && 2279 (!Subtarget->getTargetTriple().isMacOSX() || 2280 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2281 // PC-relative references to external symbols should go through $stub, 2282 // unless we're building with the leopard linker or later, which 2283 // automatically synthesizes these stubs. 2284 OpFlags = X86II::MO_DARWIN_STUB; 2285 } else if (Subtarget->isPICStyleRIPRel() && 2286 isa<Function>(GV) && 2287 cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) { 2288 // If the function is marked as non-lazy, generate an indirect call 2289 // which loads from the GOT directly. This avoids runtime overhead 2290 // at the cost of eager binding (and one extra byte of encoding). 2291 OpFlags = X86II::MO_GOTPCREL; 2292 WrapperKind = X86ISD::WrapperRIP; 2293 ExtraLoad = true; 2294 } 2295 2296 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2297 G->getOffset(), OpFlags); 2298 2299 // Add a wrapper if needed. 2300 if (WrapperKind != ISD::DELETED_NODE) 2301 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); 2302 // Add extra indirection if needed. 2303 if (ExtraLoad) 2304 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, 2305 MachinePointerInfo::getGOT(), 2306 false, false, 0); 2307 } 2308 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2309 unsigned char OpFlags = 0; 2310 2311 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to 2312 // external symbols should go through the PLT. 
2313 if (Subtarget->isTargetELF() && 2314 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2315 OpFlags = X86II::MO_PLT; 2316 } else if (Subtarget->isPICStyleStubAny() && 2317 (!Subtarget->getTargetTriple().isMacOSX() || 2318 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2319 // PC-relative references to external symbols should go through $stub, 2320 // unless we're building with the leopard linker or later, which 2321 // automatically synthesizes these stubs. 2322 OpFlags = X86II::MO_DARWIN_STUB; 2323 } 2324 2325 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2326 OpFlags); 2327 } 2328 2329 // Returns a chain & a flag for retval copy to use. 2330 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2331 SmallVector<SDValue, 8> Ops; 2332 2333 if (!IsSibcall && isTailCall) { 2334 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2335 DAG.getIntPtrConstant(0, true), InFlag); 2336 InFlag = Chain.getValue(1); 2337 } 2338 2339 Ops.push_back(Chain); 2340 Ops.push_back(Callee); 2341 2342 if (isTailCall) 2343 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2344 2345 // Add argument registers to the end of the list so that they are known live 2346 // into the call. 2347 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2348 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2349 RegsToPass[i].second.getValueType())); 2350 2351 // Add an implicit use GOT pointer in EBX. 2352 if (!isTailCall && Subtarget->isPICStyleGOT()) 2353 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2354 2355 // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. 2356 if (Is64Bit && isVarArg && !IsWin64) 2357 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 2358 2359 if (InFlag.getNode()) 2360 Ops.push_back(InFlag); 2361 2362 if (isTailCall) { 2363 // We used to do: 2364 //// If this is the first return lowered for this function, add the regs 2365 //// to the liveout set for the function. 2366 // This isn't right, although it's probably harmless on x86; liveouts 2367 // should be computed from returns not tail calls. Consider a void 2368 // function making a tail call to a function returning int. 2369 return DAG.getNode(X86ISD::TC_RETURN, dl, 2370 NodeTys, &Ops[0], Ops.size()); 2371 } 2372 2373 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2374 InFlag = Chain.getValue(1); 2375 2376 // Create the CALLSEQ_END node. 2377 unsigned NumBytesForCalleeToPush; 2378 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) 2379 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2380 else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) 2381 // If this is a call to a struct-return function, the callee 2382 // pops the hidden struct pointer, so we have to push it back. 2383 // This is common for Darwin/X86, Linux & Mingw32 targets. 2384 NumBytesForCalleeToPush = 4; 2385 else 2386 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2387 2388 // Returns a flag for retval copy to use. 2389 if (!IsSibcall) { 2390 Chain = DAG.getCALLSEQ_END(Chain, 2391 DAG.getIntPtrConstant(NumBytes, true), 2392 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2393 true), 2394 InFlag); 2395 InFlag = Chain.getValue(1); 2396 } 2397 2398 // Handle result values, copying them out of physregs into vregs that we 2399 // return. 
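  // LowerCallResult walks the locations computed by RetCC_X86 and emits the
  // CopyFromReg / FpPOP_RETVAL nodes that move each result into a virtual
  // register.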
2400    return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2401                           Ins, dl, DAG, InVals);
2402}
2403
2404
2405//===----------------------------------------------------------------------===//
2406//            Fast Calling Convention (tail call) implementation
2407//===----------------------------------------------------------------------===//
2408
2409// Like the StdCall calling convention, the callee cleans up the arguments;
2410// ECX is reserved for storing the address of the tail-called function. Only 2
2411// registers are free for argument passing (inreg). Tail call optimization is
2412// performed provided:
2413//  * tailcallopt is enabled
2414//  * caller/callee are fastcc
2415// On X86_64 architecture with GOT-style position independent code only local
2416// (within module) calls are supported at the moment.
2417// To keep the stack aligned according to the platform ABI, the function
2418// GetAlignedArgumentStackSize ensures that the argument delta is always a
2419// multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example)
2420// If a tail-called callee has more arguments than the caller, the
2421// caller needs to make sure that there is room to move the RETADDR to. This is
2422// achieved by reserving an area the size of the argument delta right after the
2423// original RETADDR, but before the saved frame pointer or the spilled registers,
2424// e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
2425// stack layout:
2426//    arg1
2427//    arg2
2428//    RETADDR
2429//    [ new RETADDR
2430//      move area ]
2431//    (possible EBP)
2432//    ESI
2433//    EDI
2434//    local1 ..
2435
2436/// GetAlignedArgumentStackSize - Round StackSize up so that, together with the
2437/// RETADDR slot, the stack stays aligned, e.g. 16n + 12 for a 16 byte alignment requirement and a 4 byte slot.
2438unsigned
2439X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2440                                               SelectionDAG& DAG) const {
2441  MachineFunction &MF = DAG.getMachineFunction();
2442  const TargetMachine &TM = MF.getTarget();
2443  const TargetFrameLowering &TFI = *TM.getFrameLowering();
2444  unsigned StackAlignment = TFI.getStackAlignment();
2445  uint64_t AlignMask = StackAlignment - 1;
2446  int64_t Offset = StackSize;
2447  uint64_t SlotSize = TD->getPointerSize();
2448  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2449    // The remainder is at most StackAlignment - SlotSize, so just add the difference.
2450    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2451  } else {
2452    // Mask out the lower bits, then add StackAlignment plus StackAlignment - SlotSize.
2453    Offset = ((~AlignMask) & Offset) + StackAlignment +
2454      (StackAlignment-SlotSize);
2455  }
2456  return Offset;
2457}
2458
2459/// MatchingStackOffset - Return true if the given stack call argument is
2460/// already available in the same position (relatively) in the caller's
2461/// incoming argument stack.
2462static 2463bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2464 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2465 const X86InstrInfo *TII) { 2466 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2467 int FI = INT_MAX; 2468 if (Arg.getOpcode() == ISD::CopyFromReg) { 2469 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2470 if (!TargetRegisterInfo::isVirtualRegister(VR)) 2471 return false; 2472 MachineInstr *Def = MRI->getVRegDef(VR); 2473 if (!Def) 2474 return false; 2475 if (!Flags.isByVal()) { 2476 if (!TII->isLoadFromStackSlot(Def, FI)) 2477 return false; 2478 } else { 2479 unsigned Opcode = Def->getOpcode(); 2480 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2481 Def->getOperand(1).isFI()) { 2482 FI = Def->getOperand(1).getIndex(); 2483 Bytes = Flags.getByValSize(); 2484 } else 2485 return false; 2486 } 2487 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2488 if (Flags.isByVal()) 2489 // ByVal argument is passed in as a pointer but it's now being 2490 // dereferenced. e.g. 2491 // define @foo(%struct.X* %A) { 2492 // tail call @bar(%struct.X* byval %A) 2493 // } 2494 return false; 2495 SDValue Ptr = Ld->getBasePtr(); 2496 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2497 if (!FINode) 2498 return false; 2499 FI = FINode->getIndex(); 2500 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 2501 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 2502 FI = FINode->getIndex(); 2503 Bytes = Flags.getByValSize(); 2504 } else 2505 return false; 2506 2507 assert(FI != INT_MAX); 2508 if (!MFI->isFixedObjectIndex(FI)) 2509 return false; 2510 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2511} 2512 2513/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2514/// for tail call optimization. Targets which want to do tail call 2515/// optimization should implement this function. 2516bool 2517X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2518 CallingConv::ID CalleeCC, 2519 bool isVarArg, 2520 bool isCalleeStructRet, 2521 bool isCallerStructRet, 2522 const SmallVectorImpl<ISD::OutputArg> &Outs, 2523 const SmallVectorImpl<SDValue> &OutVals, 2524 const SmallVectorImpl<ISD::InputArg> &Ins, 2525 SelectionDAG& DAG) const { 2526 if (!IsTailCallConvention(CalleeCC) && 2527 CalleeCC != CallingConv::C) 2528 return false; 2529 2530 // If -tailcallopt is specified, make fastcc functions tail-callable. 2531 const MachineFunction &MF = DAG.getMachineFunction(); 2532 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2533 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2534 bool CCMatch = CallerCC == CalleeCC; 2535 2536 if (GuaranteedTailCallOpt) { 2537 if (IsTailCallConvention(CalleeCC) && CCMatch) 2538 return true; 2539 return false; 2540 } 2541 2542 // Look for obvious safe cases to perform tail call optimization that do not 2543 // require ABI changes. This is what gcc calls sibcall. 2544 2545 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2546 // emit a special epilogue. 2547 if (RegInfo->needsStackRealignment(MF)) 2548 return false; 2549 2550 // Also avoid sibcall optimization if either caller or callee uses struct 2551 // return semantics. 2552 if (isCalleeStructRet || isCallerStructRet) 2553 return false; 2554 2555 // An stdcall caller is expected to clean up its arguments; the callee 2556 // isn't going to do that. 
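  // Tail-jumping to a callee with a different convention would leave the stack
  // unbalanced: the callee's return would not pop this function's incoming
  // stdcall arguments the way our own return would.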
2557 if (!CCMatch && CallerCC==CallingConv::X86_StdCall) 2558 return false; 2559 2560 // Do not sibcall optimize vararg calls unless all arguments are passed via 2561 // registers. 2562 if (isVarArg && !Outs.empty()) { 2563 2564 // Optimizing for varargs on Win64 is unlikely to be safe without 2565 // additional testing. 2566 if (Subtarget->isTargetWin64()) 2567 return false; 2568 2569 SmallVector<CCValAssign, 16> ArgLocs; 2570 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2571 getTargetMachine(), ArgLocs, *DAG.getContext()); 2572 2573 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2574 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 2575 if (!ArgLocs[i].isRegLoc()) 2576 return false; 2577 } 2578 2579 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2580 // Therefore if it's not used by the call it is not safe to optimize this into 2581 // a sibcall. 2582 bool Unused = false; 2583 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2584 if (!Ins[i].Used) { 2585 Unused = true; 2586 break; 2587 } 2588 } 2589 if (Unused) { 2590 SmallVector<CCValAssign, 16> RVLocs; 2591 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), 2592 getTargetMachine(), RVLocs, *DAG.getContext()); 2593 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2594 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2595 CCValAssign &VA = RVLocs[i]; 2596 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2597 return false; 2598 } 2599 } 2600 2601 // If the calling conventions do not match, then we'd better make sure the 2602 // results are returned in the same way as what the caller expects. 2603 if (!CCMatch) { 2604 SmallVector<CCValAssign, 16> RVLocs1; 2605 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), 2606 getTargetMachine(), RVLocs1, *DAG.getContext()); 2607 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2608 2609 SmallVector<CCValAssign, 16> RVLocs2; 2610 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), 2611 getTargetMachine(), RVLocs2, *DAG.getContext()); 2612 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2613 2614 if (RVLocs1.size() != RVLocs2.size()) 2615 return false; 2616 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2617 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2618 return false; 2619 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2620 return false; 2621 if (RVLocs1[i].isRegLoc()) { 2622 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2623 return false; 2624 } else { 2625 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2626 return false; 2627 } 2628 } 2629 } 2630 2631 // If the callee takes no arguments then go on to check the results of the 2632 // call. 2633 if (!Outs.empty()) { 2634 // Check if stack adjustment is needed. For now, do not do this if any 2635 // argument is passed on the stack. 2636 SmallVector<CCValAssign, 16> ArgLocs; 2637 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2638 getTargetMachine(), ArgLocs, *DAG.getContext()); 2639 2640 // Allocate shadow area for Win64 2641 if (Subtarget->isTargetWin64()) { 2642 CCInfo.AllocateStack(32, 8); 2643 } 2644 2645 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2646 if (CCInfo.getNextStackOffset()) { 2647 MachineFunction &MF = DAG.getMachineFunction(); 2648 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2649 return false; 2650 2651 // Check if the arguments are already laid out in the right way as 2652 // the caller's fixed stack objects. 
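      // MatchingStackOffset verifies that each stack-passed value already sits
      // at the same fixed offset in the caller's frame, in which case no
      // argument needs to be re-stored before the jump.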
2653 MachineFrameInfo *MFI = MF.getFrameInfo(); 2654 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2655 const X86InstrInfo *TII = 2656 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2657 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2658 CCValAssign &VA = ArgLocs[i]; 2659 SDValue Arg = OutVals[i]; 2660 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2661 if (VA.getLocInfo() == CCValAssign::Indirect) 2662 return false; 2663 if (!VA.isRegLoc()) { 2664 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2665 MFI, MRI, TII)) 2666 return false; 2667 } 2668 } 2669 } 2670 2671 // If the tailcall address may be in a register, then make sure it's 2672 // possible to register allocate for it. In 32-bit, the call address can 2673 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2674 // callee-saved registers are restored. These happen to be the same 2675 // registers used to pass 'inreg' arguments so watch out for those. 2676 if (!Subtarget->is64Bit() && 2677 !isa<GlobalAddressSDNode>(Callee) && 2678 !isa<ExternalSymbolSDNode>(Callee)) { 2679 unsigned NumInRegs = 0; 2680 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2681 CCValAssign &VA = ArgLocs[i]; 2682 if (!VA.isRegLoc()) 2683 continue; 2684 unsigned Reg = VA.getLocReg(); 2685 switch (Reg) { 2686 default: break; 2687 case X86::EAX: case X86::EDX: case X86::ECX: 2688 if (++NumInRegs == 3) 2689 return false; 2690 break; 2691 } 2692 } 2693 } 2694 } 2695 2696 return true; 2697} 2698 2699FastISel * 2700X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2701 return X86::createFastISel(funcInfo); 2702} 2703 2704 2705//===----------------------------------------------------------------------===// 2706// Other Lowering Hooks 2707//===----------------------------------------------------------------------===// 2708 2709static bool MayFoldLoad(SDValue Op) { 2710 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 2711} 2712 2713static bool MayFoldIntoStore(SDValue Op) { 2714 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 2715} 2716 2717static bool isTargetShuffle(unsigned Opcode) { 2718 switch(Opcode) { 2719 default: return false; 2720 case X86ISD::PSHUFD: 2721 case X86ISD::PSHUFHW: 2722 case X86ISD::PSHUFLW: 2723 case X86ISD::SHUFPD: 2724 case X86ISD::PALIGN: 2725 case X86ISD::SHUFPS: 2726 case X86ISD::MOVLHPS: 2727 case X86ISD::MOVLHPD: 2728 case X86ISD::MOVHLPS: 2729 case X86ISD::MOVLPS: 2730 case X86ISD::MOVLPD: 2731 case X86ISD::MOVSHDUP: 2732 case X86ISD::MOVSLDUP: 2733 case X86ISD::MOVDDUP: 2734 case X86ISD::MOVSS: 2735 case X86ISD::MOVSD: 2736 case X86ISD::UNPCKLPS: 2737 case X86ISD::UNPCKLPD: 2738 case X86ISD::VUNPCKLPSY: 2739 case X86ISD::VUNPCKLPDY: 2740 case X86ISD::PUNPCKLWD: 2741 case X86ISD::PUNPCKLBW: 2742 case X86ISD::PUNPCKLDQ: 2743 case X86ISD::PUNPCKLQDQ: 2744 case X86ISD::UNPCKHPS: 2745 case X86ISD::UNPCKHPD: 2746 case X86ISD::VUNPCKHPSY: 2747 case X86ISD::VUNPCKHPDY: 2748 case X86ISD::PUNPCKHWD: 2749 case X86ISD::PUNPCKHBW: 2750 case X86ISD::PUNPCKHDQ: 2751 case X86ISD::PUNPCKHQDQ: 2752 case X86ISD::VPERMILPS: 2753 case X86ISD::VPERMILPSY: 2754 case X86ISD::VPERMILPD: 2755 case X86ISD::VPERMILPDY: 2756 case X86ISD::VPERM2F128: 2757 return true; 2758 } 2759 return false; 2760} 2761 2762static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2763 SDValue V1, SelectionDAG &DAG) { 2764 switch(Opc) { 2765 default: llvm_unreachable("Unknown x86 shuffle node"); 2766 case X86ISD::MOVSHDUP: 2767 case X86ISD::MOVSLDUP: 
2768 case X86ISD::MOVDDUP: 2769 return DAG.getNode(Opc, dl, VT, V1); 2770 } 2771 2772 return SDValue(); 2773} 2774 2775static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2776 SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { 2777 switch(Opc) { 2778 default: llvm_unreachable("Unknown x86 shuffle node"); 2779 case X86ISD::PSHUFD: 2780 case X86ISD::PSHUFHW: 2781 case X86ISD::PSHUFLW: 2782 case X86ISD::VPERMILPS: 2783 case X86ISD::VPERMILPSY: 2784 case X86ISD::VPERMILPD: 2785 case X86ISD::VPERMILPDY: 2786 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2787 } 2788 2789 return SDValue(); 2790} 2791 2792static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2793 SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { 2794 switch(Opc) { 2795 default: llvm_unreachable("Unknown x86 shuffle node"); 2796 case X86ISD::PALIGN: 2797 case X86ISD::SHUFPD: 2798 case X86ISD::SHUFPS: 2799 case X86ISD::VPERM2F128: 2800 return DAG.getNode(Opc, dl, VT, V1, V2, 2801 DAG.getConstant(TargetMask, MVT::i8)); 2802 } 2803 return SDValue(); 2804} 2805 2806static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2807 SDValue V1, SDValue V2, SelectionDAG &DAG) { 2808 switch(Opc) { 2809 default: llvm_unreachable("Unknown x86 shuffle node"); 2810 case X86ISD::MOVLHPS: 2811 case X86ISD::MOVLHPD: 2812 case X86ISD::MOVHLPS: 2813 case X86ISD::MOVLPS: 2814 case X86ISD::MOVLPD: 2815 case X86ISD::MOVSS: 2816 case X86ISD::MOVSD: 2817 case X86ISD::UNPCKLPS: 2818 case X86ISD::UNPCKLPD: 2819 case X86ISD::VUNPCKLPSY: 2820 case X86ISD::VUNPCKLPDY: 2821 case X86ISD::PUNPCKLWD: 2822 case X86ISD::PUNPCKLBW: 2823 case X86ISD::PUNPCKLDQ: 2824 case X86ISD::PUNPCKLQDQ: 2825 case X86ISD::UNPCKHPS: 2826 case X86ISD::UNPCKHPD: 2827 case X86ISD::VUNPCKHPSY: 2828 case X86ISD::VUNPCKHPDY: 2829 case X86ISD::PUNPCKHWD: 2830 case X86ISD::PUNPCKHBW: 2831 case X86ISD::PUNPCKHDQ: 2832 case X86ISD::PUNPCKHQDQ: 2833 return DAG.getNode(Opc, dl, VT, V1, V2); 2834 } 2835 return SDValue(); 2836} 2837 2838SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 2839 MachineFunction &MF = DAG.getMachineFunction(); 2840 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2841 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2842 2843 if (ReturnAddrIndex == 0) { 2844 // Set up a frame object for the return address. 2845 uint64_t SlotSize = TD->getPointerSize(); 2846 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2847 false); 2848 FuncInfo->setRAIndex(ReturnAddrIndex); 2849 } 2850 2851 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2852} 2853 2854 2855bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2856 bool hasSymbolicDisplacement) { 2857 // Offset should fit into 32 bit immediate field. 2858 if (!isInt<32>(Offset)) 2859 return false; 2860 2861 // If we don't have a symbolic displacement - we don't have any extra 2862 // restrictions. 2863 if (!hasSymbolicDisplacement) 2864 return true; 2865 2866 // FIXME: Some tweaks might be needed for medium code model. 2867 if (M != CodeModel::Small && M != CodeModel::Kernel) 2868 return false; 2869 2870 // For small code model we assume that latest object is 16MB before end of 31 2871 // bits boundary. We may also accept pretty large negative constants knowing 2872 // that all objects are in the positive half of address space. 
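  // In other words, even an object based at the highest address the small
  // model allows (2GB - 16MB) stays within a signed 32-bit displacement when
  // an offset below 16MB is added.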
2873 if (M == CodeModel::Small && Offset < 16*1024*1024) 2874 return true; 2875 2876 // For kernel code model we know that all object resist in the negative half 2877 // of 32bits address space. We may not accept negative offsets, since they may 2878 // be just off and we may accept pretty large positive ones. 2879 if (M == CodeModel::Kernel && Offset > 0) 2880 return true; 2881 2882 return false; 2883} 2884 2885/// isCalleePop - Determines whether the callee is required to pop its 2886/// own arguments. Callee pop is necessary to support tail calls. 2887bool X86::isCalleePop(CallingConv::ID CallingConv, 2888 bool is64Bit, bool IsVarArg, bool TailCallOpt) { 2889 if (IsVarArg) 2890 return false; 2891 2892 switch (CallingConv) { 2893 default: 2894 return false; 2895 case CallingConv::X86_StdCall: 2896 return !is64Bit; 2897 case CallingConv::X86_FastCall: 2898 return !is64Bit; 2899 case CallingConv::X86_ThisCall: 2900 return !is64Bit; 2901 case CallingConv::Fast: 2902 return TailCallOpt; 2903 case CallingConv::GHC: 2904 return TailCallOpt; 2905 } 2906} 2907 2908/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 2909/// specific condition code, returning the condition code and the LHS/RHS of the 2910/// comparison to make. 2911static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2912 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2913 if (!isFP) { 2914 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2915 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2916 // X > -1 -> X == 0, jump !sign. 2917 RHS = DAG.getConstant(0, RHS.getValueType()); 2918 return X86::COND_NS; 2919 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2920 // X < 0 -> X == 0, jump on sign. 2921 return X86::COND_S; 2922 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2923 // X < 1 -> X <= 0 2924 RHS = DAG.getConstant(0, RHS.getValueType()); 2925 return X86::COND_LE; 2926 } 2927 } 2928 2929 switch (SetCCOpcode) { 2930 default: llvm_unreachable("Invalid integer condition!"); 2931 case ISD::SETEQ: return X86::COND_E; 2932 case ISD::SETGT: return X86::COND_G; 2933 case ISD::SETGE: return X86::COND_GE; 2934 case ISD::SETLT: return X86::COND_L; 2935 case ISD::SETLE: return X86::COND_LE; 2936 case ISD::SETNE: return X86::COND_NE; 2937 case ISD::SETULT: return X86::COND_B; 2938 case ISD::SETUGT: return X86::COND_A; 2939 case ISD::SETULE: return X86::COND_BE; 2940 case ISD::SETUGE: return X86::COND_AE; 2941 } 2942 } 2943 2944 // First determine if it is required or is profitable to flip the operands. 2945 2946 // If LHS is a foldable load, but RHS is not, flip the condition. 
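  // e.g. (setolt (load X), Y) becomes (setogt Y, (load X)), so the load ends
  // up as the right-hand operand of the comparison.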
2947 if (ISD::isNON_EXTLoad(LHS.getNode()) && 2948 !ISD::isNON_EXTLoad(RHS.getNode())) { 2949 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2950 std::swap(LHS, RHS); 2951 } 2952 2953 switch (SetCCOpcode) { 2954 default: break; 2955 case ISD::SETOLT: 2956 case ISD::SETOLE: 2957 case ISD::SETUGT: 2958 case ISD::SETUGE: 2959 std::swap(LHS, RHS); 2960 break; 2961 } 2962 2963 // On a floating point condition, the flags are set as follows: 2964 // ZF PF CF op 2965 // 0 | 0 | 0 | X > Y 2966 // 0 | 0 | 1 | X < Y 2967 // 1 | 0 | 0 | X == Y 2968 // 1 | 1 | 1 | unordered 2969 switch (SetCCOpcode) { 2970 default: llvm_unreachable("Condcode should be pre-legalized away"); 2971 case ISD::SETUEQ: 2972 case ISD::SETEQ: return X86::COND_E; 2973 case ISD::SETOLT: // flipped 2974 case ISD::SETOGT: 2975 case ISD::SETGT: return X86::COND_A; 2976 case ISD::SETOLE: // flipped 2977 case ISD::SETOGE: 2978 case ISD::SETGE: return X86::COND_AE; 2979 case ISD::SETUGT: // flipped 2980 case ISD::SETULT: 2981 case ISD::SETLT: return X86::COND_B; 2982 case ISD::SETUGE: // flipped 2983 case ISD::SETULE: 2984 case ISD::SETLE: return X86::COND_BE; 2985 case ISD::SETONE: 2986 case ISD::SETNE: return X86::COND_NE; 2987 case ISD::SETUO: return X86::COND_P; 2988 case ISD::SETO: return X86::COND_NP; 2989 case ISD::SETOEQ: 2990 case ISD::SETUNE: return X86::COND_INVALID; 2991 } 2992} 2993 2994/// hasFPCMov - is there a floating point cmov for the specific X86 condition 2995/// code. Current x86 isa includes the following FP cmov instructions: 2996/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 2997static bool hasFPCMov(unsigned X86CC) { 2998 switch (X86CC) { 2999 default: 3000 return false; 3001 case X86::COND_B: 3002 case X86::COND_BE: 3003 case X86::COND_E: 3004 case X86::COND_P: 3005 case X86::COND_A: 3006 case X86::COND_AE: 3007 case X86::COND_NE: 3008 case X86::COND_NP: 3009 return true; 3010 } 3011} 3012 3013/// isFPImmLegal - Returns true if the target can instruction select the 3014/// specified FP immediate natively. If false, the legalizer will 3015/// materialize the FP immediate as a load from a constant pool. 3016bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 3017 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 3018 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 3019 return true; 3020 } 3021 return false; 3022} 3023 3024/// isUndefOrInRange - Return true if Val is undef or if its value falls within 3025/// the specified range (L, H]. 3026static bool isUndefOrInRange(int Val, int Low, int Hi) { 3027 return (Val < 0) || (Val >= Low && Val < Hi); 3028} 3029 3030/// isUndefOrInRange - Return true if every element in Mask, begining 3031/// from position Pos and ending in Pos+Size, falls within the specified 3032/// range (L, L+Pos]. or is undef. 3033static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask, 3034 int Pos, int Size, int Low, int Hi) { 3035 for (int i = Pos, e = Pos+Size; i != e; ++i) 3036 if (!isUndefOrInRange(Mask[i], Low, Hi)) 3037 return false; 3038 return true; 3039} 3040 3041/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 3042/// specified value. 3043static bool isUndefOrEqual(int Val, int CmpVal) { 3044 if (Val < 0 || Val == CmpVal) 3045 return true; 3046 return false; 3047} 3048 3049/// isSequentialOrUndefInRange - Return true if every element in Mask, begining 3050/// from position Pos and ending in Pos+Size, falls within the specified 3051/// sequential range (L, L+Pos]. or is undef. 
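/// For example, with Mask = <4, -1, 6, 7>, Pos = 0, Size = 4 and Low = 4, the
/// defined elements 4, 6 and 7 all equal Low plus their position, so the run
/// is sequential; changing the last element to 5 would break the sequence.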
3052static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask, 3053 int Pos, int Size, int Low) { 3054 for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low) 3055 if (!isUndefOrEqual(Mask[i], Low)) 3056 return false; 3057 return true; 3058} 3059 3060/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 3061/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 3062/// the second operand. 3063static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3064 if (VT == MVT::v4f32 || VT == MVT::v4i32 ) 3065 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 3066 if (VT == MVT::v2f64 || VT == MVT::v2i64) 3067 return (Mask[0] < 2 && Mask[1] < 2); 3068 return false; 3069} 3070 3071bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 3072 SmallVector<int, 8> M; 3073 N->getMask(M); 3074 return ::isPSHUFDMask(M, N->getValueType(0)); 3075} 3076 3077/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 3078/// is suitable for input to PSHUFHW. 3079static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3080 if (VT != MVT::v8i16) 3081 return false; 3082 3083 // Lower quadword copied in order or undef. 3084 for (int i = 0; i != 4; ++i) 3085 if (Mask[i] >= 0 && Mask[i] != i) 3086 return false; 3087 3088 // Upper quadword shuffled. 3089 for (int i = 4; i != 8; ++i) 3090 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 3091 return false; 3092 3093 return true; 3094} 3095 3096bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 3097 SmallVector<int, 8> M; 3098 N->getMask(M); 3099 return ::isPSHUFHWMask(M, N->getValueType(0)); 3100} 3101 3102/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 3103/// is suitable for input to PSHUFLW. 3104static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3105 if (VT != MVT::v8i16) 3106 return false; 3107 3108 // Upper quadword copied in order. 3109 for (int i = 4; i != 8; ++i) 3110 if (Mask[i] >= 0 && Mask[i] != i) 3111 return false; 3112 3113 // Lower quadword shuffled. 3114 for (int i = 0; i != 4; ++i) 3115 if (Mask[i] >= 4) 3116 return false; 3117 3118 return true; 3119} 3120 3121bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 3122 SmallVector<int, 8> M; 3123 N->getMask(M); 3124 return ::isPSHUFLWMask(M, N->getValueType(0)); 3125} 3126 3127/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 3128/// is suitable for input to PALIGNR. 3129static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 3130 bool hasSSSE3) { 3131 int i, e = VT.getVectorNumElements(); 3132 if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64) 3133 return false; 3134 3135 // Do not handle v2i64 / v2f64 shuffles with palignr. 3136 if (e < 4 || !hasSSSE3) 3137 return false; 3138 3139 for (i = 0; i != e; ++i) 3140 if (Mask[i] >= 0) 3141 break; 3142 3143 // All undef, not a palignr. 3144 if (i == e) 3145 return false; 3146 3147 // Make sure we're shifting in the right direction. 3148 if (Mask[i] <= i) 3149 return false; 3150 3151 int s = Mask[i] - i; 3152 3153 // Check the rest of the elements to see if they are consecutive. 3154 for (++i; i != e; ++i) { 3155 int m = Mask[i]; 3156 if (m >= 0 && m != s+i) 3157 return false; 3158 } 3159 return true; 3160} 3161 3162/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 3163/// specifies a shuffle of elements that is suitable for input to SHUFP*. 
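/// For example, a v4f32 mask <1, 3, 4, 6> is accepted: the low half picks
/// elements from V1 (indices 0-3) and the high half picks elements from V2
/// (indices 4-7), which is exactly the form SHUFPS can encode.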
3164static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3165 int NumElems = VT.getVectorNumElements(); 3166 if (NumElems != 2 && NumElems != 4) 3167 return false; 3168 3169 int Half = NumElems / 2; 3170 for (int i = 0; i < Half; ++i) 3171 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 3172 return false; 3173 for (int i = Half; i < NumElems; ++i) 3174 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 3175 return false; 3176 3177 return true; 3178} 3179 3180bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 3181 SmallVector<int, 8> M; 3182 N->getMask(M); 3183 return ::isSHUFPMask(M, N->getValueType(0)); 3184} 3185 3186/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 3187/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 3188/// half elements to come from vector 1 (which would equal the dest.) and 3189/// the upper half to come from vector 2. 3190static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3191 int NumElems = VT.getVectorNumElements(); 3192 3193 if (NumElems != 2 && NumElems != 4) 3194 return false; 3195 3196 int Half = NumElems / 2; 3197 for (int i = 0; i < Half; ++i) 3198 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 3199 return false; 3200 for (int i = Half; i < NumElems; ++i) 3201 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 3202 return false; 3203 return true; 3204} 3205 3206static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 3207 SmallVector<int, 8> M; 3208 N->getMask(M); 3209 return isCommutedSHUFPMask(M, N->getValueType(0)); 3210} 3211 3212/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 3213/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 3214bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 3215 EVT VT = N->getValueType(0); 3216 unsigned NumElems = VT.getVectorNumElements(); 3217 3218 if (VT.getSizeInBits() != 128) 3219 return false; 3220 3221 if (NumElems != 4) 3222 return false; 3223 3224 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 3225 return isUndefOrEqual(N->getMaskElt(0), 6) && 3226 isUndefOrEqual(N->getMaskElt(1), 7) && 3227 isUndefOrEqual(N->getMaskElt(2), 2) && 3228 isUndefOrEqual(N->getMaskElt(3), 3); 3229} 3230 3231/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 3232/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 3233/// <2, 3, 2, 3> 3234bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 3235 EVT VT = N->getValueType(0); 3236 unsigned NumElems = VT.getVectorNumElements(); 3237 3238 if (VT.getSizeInBits() != 128) 3239 return false; 3240 3241 if (NumElems != 4) 3242 return false; 3243 3244 return isUndefOrEqual(N->getMaskElt(0), 2) && 3245 isUndefOrEqual(N->getMaskElt(1), 3) && 3246 isUndefOrEqual(N->getMaskElt(2), 2) && 3247 isUndefOrEqual(N->getMaskElt(3), 3); 3248} 3249 3250/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3251/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 
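/// For a v4f32 shuffle this corresponds to a mask like <4, 5, 2, 3>: the low
/// half of the result comes from V2 and the high half is kept from V1.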
3252bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 3253 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3254 3255 if (NumElems != 2 && NumElems != 4) 3256 return false; 3257 3258 for (unsigned i = 0; i < NumElems/2; ++i) 3259 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 3260 return false; 3261 3262 for (unsigned i = NumElems/2; i < NumElems; ++i) 3263 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3264 return false; 3265 3266 return true; 3267} 3268 3269/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3270/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 3271bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 3272 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3273 3274 if ((NumElems != 2 && NumElems != 4) 3275 || N->getValueType(0).getSizeInBits() > 128) 3276 return false; 3277 3278 for (unsigned i = 0; i < NumElems/2; ++i) 3279 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3280 return false; 3281 3282 for (unsigned i = 0; i < NumElems/2; ++i) 3283 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 3284 return false; 3285 3286 return true; 3287} 3288 3289/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3290/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3291static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3292 bool V2IsSplat = false) { 3293 int NumElts = VT.getVectorNumElements(); 3294 3295 assert((VT.is128BitVector() || VT.is256BitVector()) && 3296 "Unsupported vector type for unpckh"); 3297 3298 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) 3299 return false; 3300 3301 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3302 // independently on 128-bit lanes. 3303 unsigned NumLanes = VT.getSizeInBits()/128; 3304 unsigned NumLaneElts = NumElts/NumLanes; 3305 3306 unsigned Start = 0; 3307 unsigned End = NumLaneElts; 3308 for (unsigned s = 0; s < NumLanes; ++s) { 3309 for (unsigned i = Start, j = s * NumLaneElts; 3310 i != End; 3311 i += 2, ++j) { 3312 int BitI = Mask[i]; 3313 int BitI1 = Mask[i+1]; 3314 if (!isUndefOrEqual(BitI, j)) 3315 return false; 3316 if (V2IsSplat) { 3317 if (!isUndefOrEqual(BitI1, NumElts)) 3318 return false; 3319 } else { 3320 if (!isUndefOrEqual(BitI1, j + NumElts)) 3321 return false; 3322 } 3323 } 3324 // Process the next 128 bits. 3325 Start += NumLaneElts; 3326 End += NumLaneElts; 3327 } 3328 3329 return true; 3330} 3331 3332bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3333 SmallVector<int, 8> M; 3334 N->getMask(M); 3335 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3336} 3337 3338/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3339/// specifies a shuffle of elements that is suitable for input to UNPCKH. 3340static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 3341 bool V2IsSplat = false) { 3342 int NumElts = VT.getVectorNumElements(); 3343 3344 assert((VT.is128BitVector() || VT.is256BitVector()) && 3345 "Unsupported vector type for unpckh"); 3346 3347 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) 3348 return false; 3349 3350 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3351 // independently on 128-bit lanes. 
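  // For example, the classic v4f32 unpckh pattern is <2, 6, 3, 7>: the high
  // half of each source lane is interleaved element by element.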
3352 unsigned NumLanes = VT.getSizeInBits()/128; 3353 unsigned NumLaneElts = NumElts/NumLanes; 3354 3355 unsigned Start = 0; 3356 unsigned End = NumLaneElts; 3357 for (unsigned l = 0; l != NumLanes; ++l) { 3358 for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2; 3359 i != End; i += 2, ++j) { 3360 int BitI = Mask[i]; 3361 int BitI1 = Mask[i+1]; 3362 if (!isUndefOrEqual(BitI, j)) 3363 return false; 3364 if (V2IsSplat) { 3365 if (isUndefOrEqual(BitI1, NumElts)) 3366 return false; 3367 } else { 3368 if (!isUndefOrEqual(BitI1, j+NumElts)) 3369 return false; 3370 } 3371 } 3372 // Process the next 128 bits. 3373 Start += NumLaneElts; 3374 End += NumLaneElts; 3375 } 3376 return true; 3377} 3378 3379bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3380 SmallVector<int, 8> M; 3381 N->getMask(M); 3382 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 3383} 3384 3385/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3386/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 3387/// <0, 0, 1, 1> 3388static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3389 int NumElems = VT.getVectorNumElements(); 3390 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3391 return false; 3392 3393 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3394 // independently on 128-bit lanes. 3395 unsigned NumLanes = VT.getSizeInBits() / 128; 3396 unsigned NumLaneElts = NumElems / NumLanes; 3397 3398 for (unsigned s = 0; s < NumLanes; ++s) { 3399 for (unsigned i = s * NumLaneElts, j = s * NumLaneElts; 3400 i != NumLaneElts * (s + 1); 3401 i += 2, ++j) { 3402 int BitI = Mask[i]; 3403 int BitI1 = Mask[i+1]; 3404 3405 if (!isUndefOrEqual(BitI, j)) 3406 return false; 3407 if (!isUndefOrEqual(BitI1, j)) 3408 return false; 3409 } 3410 } 3411 3412 return true; 3413} 3414 3415bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 3416 SmallVector<int, 8> M; 3417 N->getMask(M); 3418 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 3419} 3420 3421/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3422/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 3423/// <2, 2, 3, 3> 3424static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3425 int NumElems = VT.getVectorNumElements(); 3426 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3427 return false; 3428 3429 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 3430 int BitI = Mask[i]; 3431 int BitI1 = Mask[i+1]; 3432 if (!isUndefOrEqual(BitI, j)) 3433 return false; 3434 if (!isUndefOrEqual(BitI1, j)) 3435 return false; 3436 } 3437 return true; 3438} 3439 3440bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 3441 SmallVector<int, 8> M; 3442 N->getMask(M); 3443 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 3444} 3445 3446/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3447/// specifies a shuffle of elements that is suitable for input to MOVSS, 3448/// MOVSD, and MOVD, i.e. setting the lowest element. 
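/// For example, a v4i32 mask <4, 1, 2, 3> matches: element 0 of the result is
/// taken from V2 and the remaining elements are passed through from V1,
/// mirroring how MOVSS/MOVSD replace only the lowest element.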
3449static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3450 if (VT.getVectorElementType().getSizeInBits() < 32) 3451 return false; 3452 3453 int NumElts = VT.getVectorNumElements(); 3454 3455 if (!isUndefOrEqual(Mask[0], NumElts)) 3456 return false; 3457 3458 for (int i = 1; i < NumElts; ++i) 3459 if (!isUndefOrEqual(Mask[i], i)) 3460 return false; 3461 3462 return true; 3463} 3464 3465bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 3466 SmallVector<int, 8> M; 3467 N->getMask(M); 3468 return ::isMOVLMask(M, N->getValueType(0)); 3469} 3470 3471/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered 3472/// as permutations between 128-bit chunks or halves. As an example: this 3473/// shuffle bellow: 3474/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> 3475/// The first half comes from the second half of V1 and the second half from the 3476/// the second half of V2. 3477static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT, 3478 const X86Subtarget *Subtarget) { 3479 if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) 3480 return false; 3481 3482 // The shuffle result is divided into half A and half B. In total the two 3483 // sources have 4 halves, namely: C, D, E, F. The final values of A and 3484 // B must come from C, D, E or F. 3485 int HalfSize = VT.getVectorNumElements()/2; 3486 bool MatchA = false, MatchB = false; 3487 3488 // Check if A comes from one of C, D, E, F. 3489 for (int Half = 0; Half < 4; ++Half) { 3490 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) { 3491 MatchA = true; 3492 break; 3493 } 3494 } 3495 3496 // Check if B comes from one of C, D, E, F. 3497 for (int Half = 0; Half < 4; ++Half) { 3498 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) { 3499 MatchB = true; 3500 break; 3501 } 3502 } 3503 3504 return MatchA && MatchB; 3505} 3506 3507/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle 3508/// the specified VECTOR_MASK mask with VPERM2F128 instructions. 3509static unsigned getShuffleVPERM2F128Immediate(SDNode *N) { 3510 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3511 EVT VT = SVOp->getValueType(0); 3512 3513 int HalfSize = VT.getVectorNumElements()/2; 3514 3515 int FstHalf = 0, SndHalf = 0; 3516 for (int i = 0; i < HalfSize; ++i) { 3517 if (SVOp->getMaskElt(i) > 0) { 3518 FstHalf = SVOp->getMaskElt(i)/HalfSize; 3519 break; 3520 } 3521 } 3522 for (int i = HalfSize; i < HalfSize*2; ++i) { 3523 if (SVOp->getMaskElt(i) > 0) { 3524 SndHalf = SVOp->getMaskElt(i)/HalfSize; 3525 break; 3526 } 3527 } 3528 3529 return (FstHalf | (SndHalf << 4)); 3530} 3531 3532/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand 3533/// specifies a shuffle of elements that is suitable for input to VPERMILPD*. 3534/// Note that VPERMIL mask matching is different depending whether theunderlying 3535/// type is 32 or 64. In the VPERMILPS the high half of the mask should point 3536/// to the same elements of the low, but to the higher half of the source. 3537/// In VPERMILPD the two lanes could be shuffled independently of each other 3538/// with the same restriction that lanes can't be crossed. 
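/// For example, the v4f64 mask <1, 0, 3, 2> swaps the two elements inside each
/// 128-bit lane without ever crossing lanes, so it can be matched to VPERMILPD.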
3539static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT, 3540 const X86Subtarget *Subtarget) { 3541 int NumElts = VT.getVectorNumElements(); 3542 int NumLanes = VT.getSizeInBits()/128; 3543 3544 if (!Subtarget->hasAVX()) 3545 return false; 3546 3547 // Match any permutation of 128-bit vector with 64-bit types 3548 if (NumLanes == 1 && NumElts != 2) 3549 return false; 3550 3551 // Only match 256-bit with 32 types 3552 if (VT.getSizeInBits() == 256 && NumElts != 4) 3553 return false; 3554 3555 // The mask on the high lane is independent of the low. Both can match 3556 // any element in inside its own lane, but can't cross. 3557 int LaneSize = NumElts/NumLanes; 3558 for (int l = 0; l < NumLanes; ++l) 3559 for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { 3560 int LaneStart = l*LaneSize; 3561 if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize)) 3562 return false; 3563 } 3564 3565 return true; 3566} 3567 3568/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand 3569/// specifies a shuffle of elements that is suitable for input to VPERMILPS*. 3570/// Note that VPERMIL mask matching is different depending whether theunderlying 3571/// type is 32 or 64. In the VPERMILPS the high half of the mask should point 3572/// to the same elements of the low, but to the higher half of the source. 3573/// In VPERMILPD the two lanes could be shuffled independently of each other 3574/// with the same restriction that lanes can't be crossed. 3575static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT, 3576 const X86Subtarget *Subtarget) { 3577 unsigned NumElts = VT.getVectorNumElements(); 3578 unsigned NumLanes = VT.getSizeInBits()/128; 3579 3580 if (!Subtarget->hasAVX()) 3581 return false; 3582 3583 // Match any permutation of 128-bit vector with 32-bit types 3584 if (NumLanes == 1 && NumElts != 4) 3585 return false; 3586 3587 // Only match 256-bit with 32 types 3588 if (VT.getSizeInBits() == 256 && NumElts != 8) 3589 return false; 3590 3591 // The mask on the high lane should be the same as the low. Actually, 3592 // they can differ if any of the corresponding index in a lane is undef 3593 // and the other stays in range. 3594 int LaneSize = NumElts/NumLanes; 3595 for (int i = 0; i < LaneSize; ++i) { 3596 int HighElt = i+LaneSize; 3597 bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts); 3598 bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize); 3599 3600 if (!HighValid || !LowValid) 3601 return false; 3602 if (Mask[i] < 0 || Mask[HighElt] < 0) 3603 continue; 3604 if (Mask[HighElt]-Mask[i] != LaneSize) 3605 return false; 3606 } 3607 3608 return true; 3609} 3610 3611/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle 3612/// the specified VECTOR_MASK mask with VPERMILPS* instructions. 3613static unsigned getShuffleVPERMILPSImmediate(SDNode *N) { 3614 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3615 EVT VT = SVOp->getValueType(0); 3616 3617 int NumElts = VT.getVectorNumElements(); 3618 int NumLanes = VT.getSizeInBits()/128; 3619 int LaneSize = NumElts/NumLanes; 3620 3621 // Although the mask is equal for both lanes do it twice to get the cases 3622 // where a mask will match because the same mask element is undef on the 3623 // first half but valid on the second. This would get pathological cases 3624 // such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid. 
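  // For example, the v8f32 mask <1, 0, 3, 2, 5, 4, 7, 6> yields the immediate
  // 0xB1: both lanes contribute the same four 2-bit fields (1, 0, 3, 2), so
  // OR'ing the two passes together is harmless.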
3625 unsigned Mask = 0; 3626 for (int l = 0; l < NumLanes; ++l) { 3627 for (int i = 0; i < LaneSize; ++i) { 3628 int MaskElt = SVOp->getMaskElt(i+(l*LaneSize)); 3629 if (MaskElt < 0) 3630 continue; 3631 if (MaskElt >= LaneSize) 3632 MaskElt -= LaneSize; 3633 Mask |= MaskElt << (i*2); 3634 } 3635 } 3636 3637 return Mask; 3638} 3639 3640/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle 3641/// the specified VECTOR_MASK mask with VPERMILPD* instructions. 3642static unsigned getShuffleVPERMILPDImmediate(SDNode *N) { 3643 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3644 EVT VT = SVOp->getValueType(0); 3645 3646 int NumElts = VT.getVectorNumElements(); 3647 int NumLanes = VT.getSizeInBits()/128; 3648 3649 unsigned Mask = 0; 3650 int LaneSize = NumElts/NumLanes; 3651 for (int l = 0; l < NumLanes; ++l) 3652 for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { 3653 int MaskElt = SVOp->getMaskElt(i); 3654 if (MaskElt < 0) 3655 continue; 3656 Mask |= (MaskElt-l*LaneSize) << i; 3657 } 3658 3659 return Mask; 3660} 3661 3662/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse 3663/// of what x86 movss want. X86 movs requires the lowest element to be lowest 3664/// element of vector 2 and the other elements to come from vector 1 in order. 3665static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3666 bool V2IsSplat = false, bool V2IsUndef = false) { 3667 int NumOps = VT.getVectorNumElements(); 3668 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3669 return false; 3670 3671 if (!isUndefOrEqual(Mask[0], 0)) 3672 return false; 3673 3674 for (int i = 1; i < NumOps; ++i) 3675 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3676 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3677 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3678 return false; 3679 3680 return true; 3681} 3682 3683static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3684 bool V2IsUndef = false) { 3685 SmallVector<int, 8> M; 3686 N->getMask(M); 3687 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3688} 3689 3690/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3691/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3692/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> 3693bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, 3694 const X86Subtarget *Subtarget) { 3695 if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) 3696 return false; 3697 3698 // The second vector must be undef 3699 if (N->getOperand(1).getOpcode() != ISD::UNDEF) 3700 return false; 3701 3702 EVT VT = N->getValueType(0); 3703 unsigned NumElems = VT.getVectorNumElements(); 3704 3705 if ((VT.getSizeInBits() == 128 && NumElems != 4) || 3706 (VT.getSizeInBits() == 256 && NumElems != 8)) 3707 return false; 3708 3709 // "i+1" is the value the indexed mask element must have 3710 for (unsigned i = 0; i < NumElems; i += 2) 3711 if (!isUndefOrEqual(N->getMaskElt(i), i+1) || 3712 !isUndefOrEqual(N->getMaskElt(i+1), i+1)) 3713 return false; 3714 3715 return true; 3716} 3717 3718/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3719/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 
3720/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> 3721bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, 3722 const X86Subtarget *Subtarget) { 3723 if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) 3724 return false; 3725 3726 // The second vector must be undef 3727 if (N->getOperand(1).getOpcode() != ISD::UNDEF) 3728 return false; 3729 3730 EVT VT = N->getValueType(0); 3731 unsigned NumElems = VT.getVectorNumElements(); 3732 3733 if ((VT.getSizeInBits() == 128 && NumElems != 4) || 3734 (VT.getSizeInBits() == 256 && NumElems != 8)) 3735 return false; 3736 3737 // "i" is the value the indexed mask element must have 3738 for (unsigned i = 0; i < NumElems; i += 2) 3739 if (!isUndefOrEqual(N->getMaskElt(i), i) || 3740 !isUndefOrEqual(N->getMaskElt(i+1), i)) 3741 return false; 3742 3743 return true; 3744} 3745 3746/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3747/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3748bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3749 int e = N->getValueType(0).getVectorNumElements() / 2; 3750 3751 for (int i = 0; i < e; ++i) 3752 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3753 return false; 3754 for (int i = 0; i < e; ++i) 3755 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3756 return false; 3757 return true; 3758} 3759 3760/// isVEXTRACTF128Index - Return true if the specified 3761/// EXTRACT_SUBVECTOR operand specifies a vector extract that is 3762/// suitable for input to VEXTRACTF128. 3763bool X86::isVEXTRACTF128Index(SDNode *N) { 3764 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3765 return false; 3766 3767 // The index should be aligned on a 128-bit boundary. 3768 uint64_t Index = 3769 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3770 3771 unsigned VL = N->getValueType(0).getVectorNumElements(); 3772 unsigned VBits = N->getValueType(0).getSizeInBits(); 3773 unsigned ElSize = VBits / VL; 3774 bool Result = (Index * ElSize) % 128 == 0; 3775 3776 return Result; 3777} 3778 3779/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR 3780/// operand specifies a subvector insert that is suitable for input to 3781/// VINSERTF128. 3782bool X86::isVINSERTF128Index(SDNode *N) { 3783 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3784 return false; 3785 3786 // The index should be aligned on a 128-bit boundary. 3787 uint64_t Index = 3788 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3789 3790 unsigned VL = N->getValueType(0).getVectorNumElements(); 3791 unsigned VBits = N->getValueType(0).getSizeInBits(); 3792 unsigned ElSize = VBits / VL; 3793 bool Result = (Index * ElSize) % 128 == 0; 3794 3795 return Result; 3796} 3797 3798/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3799/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3800unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3801 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3802 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3803 3804 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 3805 unsigned Mask = 0; 3806 for (int i = 0; i < NumOperands; ++i) { 3807 int Val = SVOp->getMaskElt(NumOperands-i-1); 3808 if (Val < 0) Val = 0; 3809 if (Val >= NumOperands) Val -= NumOperands; 3810 Mask |= Val; 3811 if (i != NumOperands - 1) 3812 Mask <<= Shift; 3813 } 3814 return Mask; 3815} 3816 3817/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3818/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3819unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3820 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3821 unsigned Mask = 0; 3822 // 8 nodes, but we only care about the last 4. 3823 for (unsigned i = 7; i >= 4; --i) { 3824 int Val = SVOp->getMaskElt(i); 3825 if (Val >= 0) 3826 Mask |= (Val - 4); 3827 if (i != 4) 3828 Mask <<= 2; 3829 } 3830 return Mask; 3831} 3832 3833/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3834/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3835unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3836 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3837 unsigned Mask = 0; 3838 // 8 nodes, but we only care about the first 4. 3839 for (int i = 3; i >= 0; --i) { 3840 int Val = SVOp->getMaskElt(i); 3841 if (Val >= 0) 3842 Mask |= Val; 3843 if (i != 0) 3844 Mask <<= 2; 3845 } 3846 return Mask; 3847} 3848 3849/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3850/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3851unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3852 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3853 EVT VVT = N->getValueType(0); 3854 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3855 int Val = 0; 3856 3857 unsigned i, e; 3858 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3859 Val = SVOp->getMaskElt(i); 3860 if (Val >= 0) 3861 break; 3862 } 3863 assert(Val - i > 0 && "PALIGNR imm should be positive"); 3864 return (Val - i) * EltSize; 3865} 3866 3867/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate 3868/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 3869/// instructions. 3870unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { 3871 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3872 llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); 3873 3874 uint64_t Index = 3875 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3876 3877 EVT VecVT = N->getOperand(0).getValueType(); 3878 EVT ElVT = VecVT.getVectorElementType(); 3879 3880 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 3881 return Index / NumElemsPerChunk; 3882} 3883 3884/// getInsertVINSERTF128Immediate - Return the appropriate immediate 3885/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 3886/// instructions. 3887unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { 3888 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3889 llvm_unreachable("Illegal insert subvector for VINSERTF128"); 3890 3891 uint64_t Index = 3892 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3893 3894 EVT VecVT = N->getValueType(0); 3895 EVT ElVT = VecVT.getVectorElementType(); 3896 3897 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 3898 return Index / NumElemsPerChunk; 3899} 3900 3901/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3902/// constant +0.0. 
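/// For example, the constants (i32 0) and (f32 +0.0) are zero nodes, while
/// (f32 -0.0) is not, because its sign bit is set.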
3903bool X86::isZeroNode(SDValue Elt) { 3904 return ((isa<ConstantSDNode>(Elt) && 3905 cast<ConstantSDNode>(Elt)->isNullValue()) || 3906 (isa<ConstantFPSDNode>(Elt) && 3907 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3908} 3909 3910/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3911/// their permute mask. 3912static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3913 SelectionDAG &DAG) { 3914 EVT VT = SVOp->getValueType(0); 3915 unsigned NumElems = VT.getVectorNumElements(); 3916 SmallVector<int, 8> MaskVec; 3917 3918 for (unsigned i = 0; i != NumElems; ++i) { 3919 int idx = SVOp->getMaskElt(i); 3920 if (idx < 0) 3921 MaskVec.push_back(idx); 3922 else if (idx < (int)NumElems) 3923 MaskVec.push_back(idx + NumElems); 3924 else 3925 MaskVec.push_back(idx - NumElems); 3926 } 3927 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3928 SVOp->getOperand(0), &MaskVec[0]); 3929} 3930 3931/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3932/// the two vector operands have swapped position. 3933static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 3934 unsigned NumElems = VT.getVectorNumElements(); 3935 for (unsigned i = 0; i != NumElems; ++i) { 3936 int idx = Mask[i]; 3937 if (idx < 0) 3938 continue; 3939 else if (idx < (int)NumElems) 3940 Mask[i] = idx + NumElems; 3941 else 3942 Mask[i] = idx - NumElems; 3943 } 3944} 3945 3946/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 3947/// match movhlps. The lower half elements should come from upper half of 3948/// V1 (and in order), and the upper half elements should come from the upper 3949/// half of V2 (and in order). 3950static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 3951 EVT VT = Op->getValueType(0); 3952 if (VT.getSizeInBits() != 128) 3953 return false; 3954 if (VT.getVectorNumElements() != 4) 3955 return false; 3956 for (unsigned i = 0, e = 2; i != e; ++i) 3957 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 3958 return false; 3959 for (unsigned i = 2; i != 4; ++i) 3960 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 3961 return false; 3962 return true; 3963} 3964 3965/// isScalarLoadToVector - Returns true if the node is a scalar load that 3966/// is promoted to a vector. It also returns the LoadSDNode by reference if 3967/// required. 3968static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 3969 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 3970 return false; 3971 N = N->getOperand(0).getNode(); 3972 if (!ISD::isNON_EXTLoad(N)) 3973 return false; 3974 if (LD) 3975 *LD = cast<LoadSDNode>(N); 3976 return true; 3977} 3978 3979/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 3980/// match movlp{s|d}. The lower half elements should come from lower half of 3981/// V1 (and in order), and the upper half elements should come from the upper 3982/// half of V2 (and in order). And since V1 will become the source of the 3983/// MOVLP, it must be either a vector load or a scalar load to vector. 3984static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 3985 ShuffleVectorSDNode *Op) { 3986 EVT VT = Op->getValueType(0); 3987 if (VT.getSizeInBits() != 128) 3988 return false; 3989 3990 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 3991 return false; 3992 // Is V2 is a vector load, don't do this transformation. We will try to use 3993 // load folding shufps op. 
3994 if (ISD::isNON_EXTLoad(V2)) 3995 return false; 3996 3997 unsigned NumElems = VT.getVectorNumElements(); 3998 3999 if (NumElems != 2 && NumElems != 4) 4000 return false; 4001 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 4002 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 4003 return false; 4004 for (unsigned i = NumElems/2; i != NumElems; ++i) 4005 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 4006 return false; 4007 return true; 4008} 4009 4010/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 4011/// all the same. 4012static bool isSplatVector(SDNode *N) { 4013 if (N->getOpcode() != ISD::BUILD_VECTOR) 4014 return false; 4015 4016 SDValue SplatValue = N->getOperand(0); 4017 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 4018 if (N->getOperand(i) != SplatValue) 4019 return false; 4020 return true; 4021} 4022 4023/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 4024/// to an zero vector. 4025/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 4026static bool isZeroShuffle(ShuffleVectorSDNode *N) { 4027 SDValue V1 = N->getOperand(0); 4028 SDValue V2 = N->getOperand(1); 4029 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 4030 for (unsigned i = 0; i != NumElems; ++i) { 4031 int Idx = N->getMaskElt(i); 4032 if (Idx >= (int)NumElems) { 4033 unsigned Opc = V2.getOpcode(); 4034 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 4035 continue; 4036 if (Opc != ISD::BUILD_VECTOR || 4037 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 4038 return false; 4039 } else if (Idx >= 0) { 4040 unsigned Opc = V1.getOpcode(); 4041 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 4042 continue; 4043 if (Opc != ISD::BUILD_VECTOR || 4044 !X86::isZeroNode(V1.getOperand(Idx))) 4045 return false; 4046 } 4047 } 4048 return true; 4049} 4050 4051/// getZeroVector - Returns a vector of specified type with all zero elements. 4052/// 4053static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 4054 DebugLoc dl) { 4055 assert(VT.isVector() && "Expected a vector type"); 4056 4057 // Always build SSE zero vectors as <4 x i32> bitcasted 4058 // to their dest type. This ensures they get CSE'd. 4059 SDValue Vec; 4060 if (VT.getSizeInBits() == 128) { // SSE 4061 if (HasSSE2) { // SSE2 4062 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 4063 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4064 } else { // SSE1 4065 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 4066 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 4067 } 4068 } else if (VT.getSizeInBits() == 256) { // AVX 4069 // 256-bit logic and arithmetic instructions in AVX are 4070 // all floating-point, no support for integer ops. Default 4071 // to emitting fp zeroed vectors then. 4072 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 4073 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4074 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 4075 } 4076 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 4077} 4078 4079/// getOnesVector - Returns a vector of specified type with all bits set. 4080/// Always build ones vectors as <4 x i32>. For 256-bit types, use two 4081/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their 4082/// original type, ensuring they get CSE'd. 
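/// For example, an all-ones v8f32 is built as the v4i32 constant
/// <-1, -1, -1, -1>, inserted into both halves of a v8i32, and finally
/// bitcast back to v8f32.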
4083static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 4084 assert(VT.isVector() && "Expected a vector type"); 4085 assert((VT.is128BitVector() || VT.is256BitVector()) 4086 && "Expected a 128-bit or 256-bit vector type"); 4087 4088 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 4089 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, 4090 Cst, Cst, Cst, Cst); 4091 4092 if (VT.is256BitVector()) { 4093 SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), 4094 Vec, DAG.getConstant(0, MVT::i32), DAG, dl); 4095 Vec = Insert128BitVector(InsV, Vec, 4096 DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); 4097 } 4098 4099 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 4100} 4101 4102/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 4103/// that point to V2 points to its first element. 4104static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4105 EVT VT = SVOp->getValueType(0); 4106 unsigned NumElems = VT.getVectorNumElements(); 4107 4108 bool Changed = false; 4109 SmallVector<int, 8> MaskVec; 4110 SVOp->getMask(MaskVec); 4111 4112 for (unsigned i = 0; i != NumElems; ++i) { 4113 if (MaskVec[i] > (int)NumElems) { 4114 MaskVec[i] = NumElems; 4115 Changed = true; 4116 } 4117 } 4118 if (Changed) 4119 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 4120 SVOp->getOperand(1), &MaskVec[0]); 4121 return SDValue(SVOp, 0); 4122} 4123 4124/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 4125/// operation of specified width. 4126static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4127 SDValue V2) { 4128 unsigned NumElems = VT.getVectorNumElements(); 4129 SmallVector<int, 8> Mask; 4130 Mask.push_back(NumElems); 4131 for (unsigned i = 1; i != NumElems; ++i) 4132 Mask.push_back(i); 4133 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4134} 4135 4136/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 4137static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4138 SDValue V2) { 4139 unsigned NumElems = VT.getVectorNumElements(); 4140 SmallVector<int, 8> Mask; 4141 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 4142 Mask.push_back(i); 4143 Mask.push_back(i + NumElems); 4144 } 4145 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4146} 4147 4148/// getUnpackh - Returns a vector_shuffle node for an unpackh operation. 4149static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4150 SDValue V2) { 4151 unsigned NumElems = VT.getVectorNumElements(); 4152 unsigned Half = NumElems/2; 4153 SmallVector<int, 8> Mask; 4154 for (unsigned i = 0; i != Half; ++i) { 4155 Mask.push_back(i + Half); 4156 Mask.push_back(i + NumElems + Half); 4157 } 4158 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4159} 4160 4161// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by 4162// a generic shuffle instruction because the target has no such instructions. 4163// Generate shuffles which repeat i16 and i8 several times until they can be 4164// represented by v4f32 and then be manipulated by target suported shuffles. 
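// For example, to splat element 5 of a v8i16, one unpckh of the vector with
// itself yields <4, 4, 5, 5, 6, 6, 7, 7> and the splat index becomes 1; viewed
// as v4f32, float element 1 then holds two copies of the original element 5,
// so a 4-element splat finishes the job.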
4165static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { 4166 EVT VT = V.getValueType(); 4167 int NumElems = VT.getVectorNumElements(); 4168 DebugLoc dl = V.getDebugLoc(); 4169 4170 while (NumElems > 4) { 4171 if (EltNo < NumElems/2) { 4172 V = getUnpackl(DAG, dl, VT, V, V); 4173 } else { 4174 V = getUnpackh(DAG, dl, VT, V, V); 4175 EltNo -= NumElems/2; 4176 } 4177 NumElems >>= 1; 4178 } 4179 return V; 4180} 4181 4182/// getLegalSplat - Generate a legal splat with supported x86 shuffles 4183static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { 4184 EVT VT = V.getValueType(); 4185 DebugLoc dl = V.getDebugLoc(); 4186 assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) 4187 && "Vector size not supported"); 4188 4189 bool Is128 = VT.getSizeInBits() == 128; 4190 EVT NVT = Is128 ? MVT::v4f32 : MVT::v8f32; 4191 V = DAG.getNode(ISD::BITCAST, dl, NVT, V); 4192 4193 if (Is128) { 4194 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 4195 V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]); 4196 } else { 4197 // The second half of indicies refer to the higher part, which is a 4198 // duplication of the lower one. This makes this shuffle a perfect match 4199 // for the VPERM instruction. 4200 int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo, 4201 EltNo+4, EltNo+4, EltNo+4, EltNo+4 }; 4202 V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]); 4203 } 4204 4205 return DAG.getNode(ISD::BITCAST, dl, VT, V); 4206} 4207 4208/// PromoteSplat - Splat is promoted to target supported vector shuffles. 4209static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 4210 EVT SrcVT = SV->getValueType(0); 4211 SDValue V1 = SV->getOperand(0); 4212 DebugLoc dl = SV->getDebugLoc(); 4213 4214 int EltNo = SV->getSplatIndex(); 4215 int NumElems = SrcVT.getVectorNumElements(); 4216 unsigned Size = SrcVT.getSizeInBits(); 4217 4218 // Extract the 128-bit part containing the splat element and update 4219 // the splat element index when it refers to the higher register. 4220 if (Size == 256) { 4221 unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0; 4222 V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl); 4223 if (Idx > 0) 4224 EltNo -= NumElems/2; 4225 } 4226 4227 // All i16 and i8 vector types can't be used directly by a generic shuffle 4228 // instruction because the target has no such instruction. Generate shuffles 4229 // which repeat i16 and i8 several times until they fit in i32, and then can 4230 // be manipulated by target suported shuffles. After the insertion of the 4231 // necessary shuffles, the result is bitcasted back to v4f32 or v8f32. 4232 EVT EltVT = SrcVT.getVectorElementType(); 4233 if (NumElems > 4 && (EltVT == MVT::i8 || EltVT == MVT::i16)) 4234 V1 = PromoteSplati8i16(V1, DAG, EltNo); 4235 4236 // Recreate the 256-bit vector and place the same 128-bit vector 4237 // into the low and high part. This is necessary because we want 4238 // to use VPERM to shuffle the v8f32 vector, and VPERM only shuffles 4239 // inside each separate v4f32 lane. 4240 if (Size == 256) { 4241 SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1, 4242 DAG.getConstant(0, MVT::i32), DAG, dl); 4243 V1 = Insert128BitVector(InsV, V1, 4244 DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); 4245 } 4246 4247 return getLegalSplat(DAG, V1, EltNo); 4248} 4249 4250/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 4251/// vector of zero or undef vector. 
This produces a shuffle where the low 4252/// element of V2 is swizzled into the zero/undef vector, landing at element 4253/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 4254static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 4255 bool isZero, bool HasSSE2, 4256 SelectionDAG &DAG) { 4257 EVT VT = V2.getValueType(); 4258 SDValue V1 = isZero 4259 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 4260 unsigned NumElems = VT.getVectorNumElements(); 4261 SmallVector<int, 16> MaskVec; 4262 for (unsigned i = 0; i != NumElems; ++i) 4263 // If this is the insertion idx, put the low elt of V2 here. 4264 MaskVec.push_back(i == Idx ? NumElems : i); 4265 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 4266} 4267 4268/// getShuffleScalarElt - Returns the scalar element that will make up the ith 4269/// element of the result of the vector shuffle. 4270static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, 4271 unsigned Depth) { 4272 if (Depth == 6) 4273 return SDValue(); // Limit search depth. 4274 4275 SDValue V = SDValue(N, 0); 4276 EVT VT = V.getValueType(); 4277 unsigned Opcode = V.getOpcode(); 4278 4279 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 4280 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 4281 Index = SV->getMaskElt(Index); 4282 4283 if (Index < 0) 4284 return DAG.getUNDEF(VT.getVectorElementType()); 4285 4286 int NumElems = VT.getVectorNumElements(); 4287 SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); 4288 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); 4289 } 4290 4291 // Recurse into target specific vector shuffles to find scalars. 4292 if (isTargetShuffle(Opcode)) { 4293 int NumElems = VT.getVectorNumElements(); 4294 SmallVector<unsigned, 16> ShuffleMask; 4295 SDValue ImmN; 4296 4297 switch(Opcode) { 4298 case X86ISD::SHUFPS: 4299 case X86ISD::SHUFPD: 4300 ImmN = N->getOperand(N->getNumOperands()-1); 4301 DecodeSHUFPSMask(NumElems, 4302 cast<ConstantSDNode>(ImmN)->getZExtValue(), 4303 ShuffleMask); 4304 break; 4305 case X86ISD::PUNPCKHBW: 4306 case X86ISD::PUNPCKHWD: 4307 case X86ISD::PUNPCKHDQ: 4308 case X86ISD::PUNPCKHQDQ: 4309 DecodePUNPCKHMask(NumElems, ShuffleMask); 4310 break; 4311 case X86ISD::UNPCKHPS: 4312 case X86ISD::UNPCKHPD: 4313 case X86ISD::VUNPCKHPSY: 4314 case X86ISD::VUNPCKHPDY: 4315 DecodeUNPCKHPMask(NumElems, ShuffleMask); 4316 break; 4317 case X86ISD::PUNPCKLBW: 4318 case X86ISD::PUNPCKLWD: 4319 case X86ISD::PUNPCKLDQ: 4320 case X86ISD::PUNPCKLQDQ: 4321 DecodePUNPCKLMask(VT, ShuffleMask); 4322 break; 4323 case X86ISD::UNPCKLPS: 4324 case X86ISD::UNPCKLPD: 4325 case X86ISD::VUNPCKLPSY: 4326 case X86ISD::VUNPCKLPDY: 4327 DecodeUNPCKLPMask(VT, ShuffleMask); 4328 break; 4329 case X86ISD::MOVHLPS: 4330 DecodeMOVHLPSMask(NumElems, ShuffleMask); 4331 break; 4332 case X86ISD::MOVLHPS: 4333 DecodeMOVLHPSMask(NumElems, ShuffleMask); 4334 break; 4335 case X86ISD::PSHUFD: 4336 ImmN = N->getOperand(N->getNumOperands()-1); 4337 DecodePSHUFMask(NumElems, 4338 cast<ConstantSDNode>(ImmN)->getZExtValue(), 4339 ShuffleMask); 4340 break; 4341 case X86ISD::PSHUFHW: 4342 ImmN = N->getOperand(N->getNumOperands()-1); 4343 DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 4344 ShuffleMask); 4345 break; 4346 case X86ISD::PSHUFLW: 4347 ImmN = N->getOperand(N->getNumOperands()-1); 4348 DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 4349 ShuffleMask); 4350 
break; 4351 case X86ISD::MOVSS: 4352 case X86ISD::MOVSD: { 4353 // The index 0 always comes from the first element of the second source, 4354 // this is why MOVSS and MOVSD are used in the first place. The other 4355 // elements come from the other positions of the first source vector. 4356 unsigned OpNum = (Index == 0) ? 1 : 0; 4357 return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG, 4358 Depth+1); 4359 } 4360 case X86ISD::VPERMILPS: 4361 ImmN = N->getOperand(N->getNumOperands()-1); 4362 DecodeVPERMILPSMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), 4363 ShuffleMask); 4364 break; 4365 case X86ISD::VPERMILPSY: 4366 ImmN = N->getOperand(N->getNumOperands()-1); 4367 DecodeVPERMILPSMask(8, cast<ConstantSDNode>(ImmN)->getZExtValue(), 4368 ShuffleMask); 4369 break; 4370 case X86ISD::VPERMILPD: 4371 ImmN = N->getOperand(N->getNumOperands()-1); 4372 DecodeVPERMILPDMask(2, cast<ConstantSDNode>(ImmN)->getZExtValue(), 4373 ShuffleMask); 4374 break; 4375 case X86ISD::VPERMILPDY: 4376 ImmN = N->getOperand(N->getNumOperands()-1); 4377 DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), 4378 ShuffleMask); 4379 break; 4380 case X86ISD::VPERM2F128: 4381 ImmN = N->getOperand(N->getNumOperands()-1); 4382 DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), 4383 ShuffleMask); 4384 break; 4385 default: 4386 assert("not implemented for target shuffle node"); 4387 return SDValue(); 4388 } 4389 4390 Index = ShuffleMask[Index]; 4391 if (Index < 0) 4392 return DAG.getUNDEF(VT.getVectorElementType()); 4393 4394 SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1); 4395 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, 4396 Depth+1); 4397 } 4398 4399 // Actual nodes that may contain scalar elements 4400 if (Opcode == ISD::BITCAST) { 4401 V = V.getOperand(0); 4402 EVT SrcVT = V.getValueType(); 4403 unsigned NumElems = VT.getVectorNumElements(); 4404 4405 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) 4406 return SDValue(); 4407 } 4408 4409 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) 4410 return (Index == 0) ? V.getOperand(0) 4411 : DAG.getUNDEF(VT.getVectorElementType()); 4412 4413 if (V.getOpcode() == ISD::BUILD_VECTOR) 4414 return V.getOperand(Index); 4415 4416 return SDValue(); 4417} 4418 4419/// getNumOfConsecutiveZeros - Return the number of elements of a vector 4420/// shuffle operation which come from a consecutively from a zero. The 4421/// search can start in two different directions, from left or right. 4422static 4423unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems, 4424 bool ZerosFromLeft, SelectionDAG &DAG) { 4425 int i = 0; 4426 4427 while (i < NumElems) { 4428 unsigned Index = ZerosFromLeft ? i : NumElems-i-1; 4429 SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0); 4430 if (!(Elt.getNode() && 4431 (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)))) 4432 break; 4433 ++i; 4434 } 4435 4436 return i; 4437} 4438 4439/// isShuffleMaskConsecutive - Check if the shuffle mask indicies from MaskI to 4440/// MaskE correspond consecutively to elements from one of the vector operands, 4441/// starting from its index OpIdx. Also tell OpNum which source vector operand. 
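/// For example, with NumElems = 4 and a mask whose elements 0-2 are <1, 2, 3>,
/// calling this with MaskI = 0, MaskE = 2 and OpIdx = 1 succeeds and reports
/// OpNum = 0, since the run comes consecutively from the first operand.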
4442static 4443bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE, 4444 int OpIdx, int NumElems, unsigned &OpNum) { 4445 bool SeenV1 = false; 4446 bool SeenV2 = false; 4447 4448 for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) { 4449 int Idx = SVOp->getMaskElt(i); 4450 // Ignore undef indicies 4451 if (Idx < 0) 4452 continue; 4453 4454 if (Idx < NumElems) 4455 SeenV1 = true; 4456 else 4457 SeenV2 = true; 4458 4459 // Only accept consecutive elements from the same vector 4460 if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2)) 4461 return false; 4462 } 4463 4464 OpNum = SeenV1 ? 0 : 1; 4465 return true; 4466} 4467 4468/// isVectorShiftRight - Returns true if the shuffle can be implemented as a 4469/// logical left shift of a vector. 4470static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4471 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4472 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 4473 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 4474 false /* check zeros from right */, DAG); 4475 unsigned OpSrc; 4476 4477 if (!NumZeros) 4478 return false; 4479 4480 // Considering the elements in the mask that are not consecutive zeros, 4481 // check if they consecutively come from only one of the source vectors. 4482 // 4483 // V1 = {X, A, B, C} 0 4484 // \ \ \ / 4485 // vector_shuffle V1, V2 <1, 2, 3, X> 4486 // 4487 if (!isShuffleMaskConsecutive(SVOp, 4488 0, // Mask Start Index 4489 NumElems-NumZeros-1, // Mask End Index 4490 NumZeros, // Where to start looking in the src vector 4491 NumElems, // Number of elements in vector 4492 OpSrc)) // Which source operand ? 4493 return false; 4494 4495 isLeft = false; 4496 ShAmt = NumZeros; 4497 ShVal = SVOp->getOperand(OpSrc); 4498 return true; 4499} 4500 4501/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a 4502/// logical left shift of a vector. 4503static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4504 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4505 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 4506 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 4507 true /* check zeros from left */, DAG); 4508 unsigned OpSrc; 4509 4510 if (!NumZeros) 4511 return false; 4512 4513 // Considering the elements in the mask that are not consecutive zeros, 4514 // check if they consecutively come from only one of the source vectors. 4515 // 4516 // 0 { A, B, X, X } = V2 4517 // / \ / / 4518 // vector_shuffle V1, V2 <X, X, 4, 5> 4519 // 4520 if (!isShuffleMaskConsecutive(SVOp, 4521 NumZeros, // Mask Start Index 4522 NumElems-1, // Mask End Index 4523 0, // Where to start looking in the src vector 4524 NumElems, // Number of elements in vector 4525 OpSrc)) // Which source operand ? 4526 return false; 4527 4528 isLeft = true; 4529 ShAmt = NumZeros; 4530 ShVal = SVOp->getOperand(OpSrc); 4531 return true; 4532} 4533 4534/// isVectorShift - Returns true if the shuffle can be implemented as a 4535/// logical left or right shift of a vector. 4536static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4537 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4538 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 4539 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 4540 return true; 4541 4542 return false; 4543} 4544 4545/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
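/// Adjacent byte pairs are zero-extended to i16, the odd byte is shifted left
/// by 8 and OR'd with the even byte, and the combined value is inserted into a
/// v8i16; e.g. bytes 0x12 (even index) and 0x34 (odd index) become the i16
/// element 0x3412.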
4546/// 4547static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 4548 unsigned NumNonZero, unsigned NumZero, 4549 SelectionDAG &DAG, 4550 const TargetLowering &TLI) { 4551 if (NumNonZero > 8) 4552 return SDValue(); 4553 4554 DebugLoc dl = Op.getDebugLoc(); 4555 SDValue V(0, 0); 4556 bool First = true; 4557 for (unsigned i = 0; i < 16; ++i) { 4558 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 4559 if (ThisIsNonZero && First) { 4560 if (NumZero) 4561 V = getZeroVector(MVT::v8i16, true, DAG, dl); 4562 else 4563 V = DAG.getUNDEF(MVT::v8i16); 4564 First = false; 4565 } 4566 4567 if ((i & 1) != 0) { 4568 SDValue ThisElt(0, 0), LastElt(0, 0); 4569 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 4570 if (LastIsNonZero) { 4571 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 4572 MVT::i16, Op.getOperand(i-1)); 4573 } 4574 if (ThisIsNonZero) { 4575 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 4576 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 4577 ThisElt, DAG.getConstant(8, MVT::i8)); 4578 if (LastIsNonZero) 4579 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 4580 } else 4581 ThisElt = LastElt; 4582 4583 if (ThisElt.getNode()) 4584 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 4585 DAG.getIntPtrConstant(i/2)); 4586 } 4587 } 4588 4589 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 4590} 4591 4592/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 4593/// 4594static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 4595 unsigned NumNonZero, unsigned NumZero, 4596 SelectionDAG &DAG, 4597 const TargetLowering &TLI) { 4598 if (NumNonZero > 4) 4599 return SDValue(); 4600 4601 DebugLoc dl = Op.getDebugLoc(); 4602 SDValue V(0, 0); 4603 bool First = true; 4604 for (unsigned i = 0; i < 8; ++i) { 4605 bool isNonZero = (NonZeros & (1 << i)) != 0; 4606 if (isNonZero) { 4607 if (First) { 4608 if (NumZero) 4609 V = getZeroVector(MVT::v8i16, true, DAG, dl); 4610 else 4611 V = DAG.getUNDEF(MVT::v8i16); 4612 First = false; 4613 } 4614 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 4615 MVT::v8i16, V, Op.getOperand(i), 4616 DAG.getIntPtrConstant(i)); 4617 } 4618 } 4619 4620 return V; 4621} 4622 4623/// getVShift - Return a vector logical shift node. 4624/// 4625static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 4626 unsigned NumBits, SelectionDAG &DAG, 4627 const TargetLowering &TLI, DebugLoc dl) { 4628 EVT ShVT = MVT::v2i64; 4629 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 4630 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 4631 return DAG.getNode(ISD::BITCAST, dl, VT, 4632 DAG.getNode(Opc, dl, ShVT, SrcOp, 4633 DAG.getConstant(NumBits, 4634 TLI.getShiftAmountTy(SrcOp.getValueType())))); 4635} 4636 4637SDValue 4638X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 4639 SelectionDAG &DAG) const { 4640 4641 // Check if the scalar load can be widened into a vector load. And if 4642 // the address is "base + cst" see if the cst can be "absorbed" into 4643 // the shuffle mask. 
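  // For example, a 4-byte scalar load from 'slot + 8', where 'slot' is a
  // 16-byte aligned stack object, can become a vector load of the whole slot
  // followed by a <2, 2, 2, 2> splat shuffle: the +8 offset is absorbed as
  // the shuffle mask element index.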
4644 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 4645 SDValue Ptr = LD->getBasePtr(); 4646 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 4647 return SDValue(); 4648 EVT PVT = LD->getValueType(0); 4649 if (PVT != MVT::i32 && PVT != MVT::f32) 4650 return SDValue(); 4651 4652 int FI = -1; 4653 int64_t Offset = 0; 4654 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 4655 FI = FINode->getIndex(); 4656 Offset = 0; 4657 } else if (DAG.isBaseWithConstantOffset(Ptr) && 4658 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 4659 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 4660 Offset = Ptr.getConstantOperandVal(1); 4661 Ptr = Ptr.getOperand(0); 4662 } else { 4663 return SDValue(); 4664 } 4665 4666 // FIXME: 256-bit vector instructions don't require a strict alignment, 4667 // improve this code to support it better. 4668 unsigned RequiredAlign = VT.getSizeInBits()/8; 4669 SDValue Chain = LD->getChain(); 4670 // Make sure the stack object alignment is at least 16 or 32. 4671 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4672 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { 4673 if (MFI->isFixedObjectIndex(FI)) { 4674 // Can't change the alignment. FIXME: It's possible to compute 4675 // the exact stack offset and reference FI + adjust offset instead. 4676 // If someone *really* cares about this. That's the way to implement it. 4677 return SDValue(); 4678 } else { 4679 MFI->setObjectAlignment(FI, RequiredAlign); 4680 } 4681 } 4682 4683 // (Offset % 16 or 32) must be multiple of 4. Then address is then 4684 // Ptr + (Offset & ~15). 4685 if (Offset < 0) 4686 return SDValue(); 4687 if ((Offset % RequiredAlign) & 3) 4688 return SDValue(); 4689 int64_t StartOffset = Offset & ~(RequiredAlign-1); 4690 if (StartOffset) 4691 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 4692 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 4693 4694 int EltNo = (Offset - StartOffset) >> 2; 4695 int NumElems = VT.getVectorNumElements(); 4696 4697 EVT CanonVT = VT.getSizeInBits() == 128 ? MVT::v4i32 : MVT::v8i32; 4698 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); 4699 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, 4700 LD->getPointerInfo().getWithOffset(StartOffset), 4701 false, false, 0); 4702 4703 // Canonicalize it to a v4i32 or v8i32 shuffle. 4704 SmallVector<int, 8> Mask; 4705 for (int i = 0; i < NumElems; ++i) 4706 Mask.push_back(EltNo); 4707 4708 V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1); 4709 return DAG.getNode(ISD::BITCAST, dl, NVT, 4710 DAG.getVectorShuffle(CanonVT, dl, V1, 4711 DAG.getUNDEF(CanonVT),&Mask[0])); 4712 } 4713 4714 return SDValue(); 4715} 4716 4717/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 4718/// vector of type 'VT', see if the elements can be replaced by a single large 4719/// load which has the same value as a build_vector whose operands are 'elts'. 4720/// 4721/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 4722/// 4723/// FIXME: we'd also like to handle the case where the last elements are zero 4724/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 4725/// There's even a handy isZeroNode for that purpose. 
4726static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 4727 DebugLoc &DL, SelectionDAG &DAG) { 4728 EVT EltVT = VT.getVectorElementType(); 4729 unsigned NumElems = Elts.size(); 4730 4731 LoadSDNode *LDBase = NULL; 4732 unsigned LastLoadedElt = -1U; 4733 4734 // For each element in the initializer, see if we've found a load or an undef. 4735 // If we don't find an initial load element, or later load elements are 4736 // non-consecutive, bail out. 4737 for (unsigned i = 0; i < NumElems; ++i) { 4738 SDValue Elt = Elts[i]; 4739 4740 if (!Elt.getNode() || 4741 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 4742 return SDValue(); 4743 if (!LDBase) { 4744 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 4745 return SDValue(); 4746 LDBase = cast<LoadSDNode>(Elt.getNode()); 4747 LastLoadedElt = i; 4748 continue; 4749 } 4750 if (Elt.getOpcode() == ISD::UNDEF) 4751 continue; 4752 4753 LoadSDNode *LD = cast<LoadSDNode>(Elt); 4754 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 4755 return SDValue(); 4756 LastLoadedElt = i; 4757 } 4758 4759 // If we have found an entire vector of loads and undefs, then return a large 4760 // load of the entire vector width starting at the base pointer. If we found 4761 // consecutive loads for the low half, generate a vzext_load node. 4762 if (LastLoadedElt == NumElems - 1) { 4763 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 4764 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4765 LDBase->getPointerInfo(), 4766 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 4767 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4768 LDBase->getPointerInfo(), 4769 LDBase->isVolatile(), LDBase->isNonTemporal(), 4770 LDBase->getAlignment()); 4771 } else if (NumElems == 4 && LastLoadedElt == 1 && 4772 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { 4773 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 4774 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 4775 SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, 4776 Ops, 2, MVT::i32, 4777 LDBase->getMemOperand()); 4778 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 4779 } 4780 return SDValue(); 4781} 4782 4783SDValue 4784X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 4785 DebugLoc dl = Op.getDebugLoc(); 4786 4787 EVT VT = Op.getValueType(); 4788 EVT ExtVT = VT.getVectorElementType(); 4789 unsigned NumElems = Op.getNumOperands(); 4790 4791 // Vectors containing all zeros can be matched by pxor and xorps later 4792 if (ISD::isBuildVectorAllZeros(Op.getNode())) { 4793 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd 4794 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. 4795 if (Op.getValueType() == MVT::v4i32 || 4796 Op.getValueType() == MVT::v8i32) 4797 return Op; 4798 4799 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 4800 } 4801 4802 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width 4803 // vectors or broken into v4i32 operations on 256-bit vectors. 
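  // E.g. an all-ones v4i32 build_vector is returned unchanged below so that
  // isel can match it directly to pcmpeqd; all other all-ones value types are
  // synthesized by getOnesVector.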
4804 if (ISD::isBuildVectorAllOnes(Op.getNode())) { 4805 if (Op.getValueType() == MVT::v4i32) 4806 return Op; 4807 4808 return getOnesVector(Op.getValueType(), DAG, dl); 4809 } 4810 4811 unsigned EVTBits = ExtVT.getSizeInBits(); 4812 4813 unsigned NumZero = 0; 4814 unsigned NumNonZero = 0; 4815 unsigned NonZeros = 0; 4816 bool IsAllConstants = true; 4817 SmallSet<SDValue, 8> Values; 4818 for (unsigned i = 0; i < NumElems; ++i) { 4819 SDValue Elt = Op.getOperand(i); 4820 if (Elt.getOpcode() == ISD::UNDEF) 4821 continue; 4822 Values.insert(Elt); 4823 if (Elt.getOpcode() != ISD::Constant && 4824 Elt.getOpcode() != ISD::ConstantFP) 4825 IsAllConstants = false; 4826 if (X86::isZeroNode(Elt)) 4827 NumZero++; 4828 else { 4829 NonZeros |= (1 << i); 4830 NumNonZero++; 4831 } 4832 } 4833 4834 // All undef vector. Return an UNDEF. All zero vectors were handled above. 4835 if (NumNonZero == 0) 4836 return DAG.getUNDEF(VT); 4837 4838 // Special case for single non-zero, non-undef, element. 4839 if (NumNonZero == 1) { 4840 unsigned Idx = CountTrailingZeros_32(NonZeros); 4841 SDValue Item = Op.getOperand(Idx); 4842 4843 // If this is an insertion of an i64 value on x86-32, and if the top bits of 4844 // the value are obviously zero, truncate the value to i32 and do the 4845 // insertion that way. Only do this if the value is non-constant or if the 4846 // value is a constant being inserted into element 0. It is cheaper to do 4847 // a constant pool load than it is to do a movd + shuffle. 4848 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 4849 (!IsAllConstants || Idx == 0)) { 4850 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 4851 // Handle SSE only. 4852 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 4853 EVT VecVT = MVT::v4i32; 4854 unsigned VecElts = 4; 4855 4856 // Truncate the value (which may itself be a constant) to i32, and 4857 // convert it to a vector with movd (S2V+shuffle to zero extend). 4858 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 4859 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 4860 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4861 Subtarget->hasSSE2(), DAG); 4862 4863 // Now we have our 32-bit value zero extended in the low element of 4864 // a vector. If Idx != 0, swizzle it into place. 4865 if (Idx != 0) { 4866 SmallVector<int, 4> Mask; 4867 Mask.push_back(Idx); 4868 for (unsigned i = 1; i != VecElts; ++i) 4869 Mask.push_back(i); 4870 Item = DAG.getVectorShuffle(VecVT, dl, Item, 4871 DAG.getUNDEF(Item.getValueType()), 4872 &Mask[0]); 4873 } 4874 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); 4875 } 4876 } 4877 4878 // If we have a constant or non-constant insertion into the low element of 4879 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 4880 // the rest of the elements. This will be matched as movd/movq/movss/movsd 4881 // depending on what the source datatype is. 4882 if (Idx == 0) { 4883 if (NumZero == 0) { 4884 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4885 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 4886 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 4887 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4888 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 
4889 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 4890 DAG); 4891 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 4892 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 4893 assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); 4894 EVT MiddleVT = MVT::v4i32; 4895 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 4896 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4897 Subtarget->hasSSE2(), DAG); 4898 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 4899 } 4900 } 4901 4902 // Is it a vector logical left shift? 4903 if (NumElems == 2 && Idx == 1 && 4904 X86::isZeroNode(Op.getOperand(0)) && 4905 !X86::isZeroNode(Op.getOperand(1))) { 4906 unsigned NumBits = VT.getSizeInBits(); 4907 return getVShift(true, VT, 4908 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4909 VT, Op.getOperand(1)), 4910 NumBits/2, DAG, *this, dl); 4911 } 4912 4913 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 4914 return SDValue(); 4915 4916 // Otherwise, if this is a vector with i32 or f32 elements, and the element 4917 // is a non-constant being inserted into an element other than the low one, 4918 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 4919 // movd/movss) to move this into the low element, then shuffle it into 4920 // place. 4921 if (EVTBits == 32) { 4922 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4923 4924 // Turn it into a shuffle of zero and zero-extended scalar to vector. 4925 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 4926 Subtarget->hasSSE2(), DAG); 4927 SmallVector<int, 8> MaskVec; 4928 for (unsigned i = 0; i < NumElems; i++) 4929 MaskVec.push_back(i == Idx ? 0 : 1); 4930 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 4931 } 4932 } 4933 4934 // Splat is obviously ok. Let legalizer expand it to a shuffle. 4935 if (Values.size() == 1) { 4936 if (EVTBits == 32) { 4937 // Instead of a shuffle like this: 4938 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 4939 // Check if it's possible to issue this instead. 4940 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 4941 unsigned Idx = CountTrailingZeros_32(NonZeros); 4942 SDValue Item = Op.getOperand(Idx); 4943 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 4944 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 4945 } 4946 return SDValue(); 4947 } 4948 4949 // A vector full of immediates; various special cases are already 4950 // handled, so this is best done with a single constant-pool load. 4951 if (IsAllConstants) 4952 return SDValue(); 4953 4954 // For AVX-length vectors, build the individual 128-bit pieces and use 4955 // shuffles to put them in place. 4956 if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) { 4957 SmallVector<SDValue, 32> V; 4958 for (unsigned i = 0; i < NumElems; ++i) 4959 V.push_back(Op.getOperand(i)); 4960 4961 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); 4962 4963 // Build both the lower and upper subvector. 4964 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); 4965 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], 4966 NumElems/2); 4967 4968 // Recreate the wider vector with the lower and upper part. 
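    // The wide result is built as two INSERT_SUBVECTOR nodes starting from an
    // undef vector, which are later matched to vinsertf128 (or a plain
    // subregister insertion for the low half).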
4969 SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower, 4970 DAG.getConstant(0, MVT::i32), DAG, dl); 4971 return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32), 4972 DAG, dl); 4973 } 4974 4975 // Let legalizer expand 2-wide build_vectors. 4976 if (EVTBits == 64) { 4977 if (NumNonZero == 1) { 4978 // One half is zero or undef. 4979 unsigned Idx = CountTrailingZeros_32(NonZeros); 4980 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 4981 Op.getOperand(Idx)); 4982 return getShuffleVectorZeroOrUndef(V2, Idx, true, 4983 Subtarget->hasSSE2(), DAG); 4984 } 4985 return SDValue(); 4986 } 4987 4988 // If element VT is < 32 bits, convert it to inserts into a zero vector. 4989 if (EVTBits == 8 && NumElems == 16) { 4990 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 4991 *this); 4992 if (V.getNode()) return V; 4993 } 4994 4995 if (EVTBits == 16 && NumElems == 8) { 4996 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 4997 *this); 4998 if (V.getNode()) return V; 4999 } 5000 5001 // If element VT is == 32 bits, turn it into a number of shuffles. 5002 SmallVector<SDValue, 8> V; 5003 V.resize(NumElems); 5004 if (NumElems == 4 && NumZero > 0) { 5005 for (unsigned i = 0; i < 4; ++i) { 5006 bool isZero = !(NonZeros & (1 << i)); 5007 if (isZero) 5008 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 5009 else 5010 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 5011 } 5012 5013 for (unsigned i = 0; i < 2; ++i) { 5014 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 5015 default: break; 5016 case 0: 5017 V[i] = V[i*2]; // Must be a zero vector. 5018 break; 5019 case 1: 5020 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 5021 break; 5022 case 2: 5023 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 5024 break; 5025 case 3: 5026 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 5027 break; 5028 } 5029 } 5030 5031 SmallVector<int, 8> MaskVec; 5032 bool Reverse = (NonZeros & 0x3) == 2; 5033 for (unsigned i = 0; i < 2; ++i) 5034 MaskVec.push_back(Reverse ? 1-i : i); 5035 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 5036 for (unsigned i = 0; i < 2; ++i) 5037 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 5038 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 5039 } 5040 5041 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 5042 // Check for a build vector of consecutive loads. 5043 for (unsigned i = 0; i < NumElems; ++i) 5044 V[i] = Op.getOperand(i); 5045 5046 // Check for elements which are consecutive loads. 5047 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 5048 if (LD.getNode()) 5049 return LD; 5050 5051 // For SSE 4.1, use insertps to put the high elements into the low element. 5052 if (getSubtarget()->hasSSE41()) { 5053 SDValue Result; 5054 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 5055 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 5056 else 5057 Result = DAG.getUNDEF(VT); 5058 5059 for (unsigned i = 1; i < NumElems; ++i) { 5060 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 5061 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 5062 Op.getOperand(i), DAG.getIntPtrConstant(i)); 5063 } 5064 return Result; 5065 } 5066 5067 // Otherwise, expand into a number of unpckl*, start by extending each of 5068 // our (non-undef) elements to the full vector width with the element in the 5069 // bottom slot of the vector (which generates no code for SSE). 
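    // E.g. each defined operand e_i becomes (VT (scalar_to_vector e_i));
    // undef operands simply stay undef vectors.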
5070 for (unsigned i = 0; i < NumElems; ++i) { 5071 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 5072 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 5073 else 5074 V[i] = DAG.getUNDEF(VT); 5075 } 5076 5077 // Next, we iteratively mix elements, e.g. for v4f32: 5078 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 5079 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 5080 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 5081 unsigned EltStride = NumElems >> 1; 5082 while (EltStride != 0) { 5083 for (unsigned i = 0; i < EltStride; ++i) { 5084 // If V[i+EltStride] is undef and this is the first round of mixing, 5085 // then it is safe to just drop this shuffle: V[i] is already in the 5086 // right place, the one element (since it's the first round) being 5087 // inserted as undef can be dropped. This isn't safe for successive 5088 // rounds because they will permute elements within both vectors. 5089 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 5090 EltStride == NumElems/2) 5091 continue; 5092 5093 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 5094 } 5095 EltStride >>= 1; 5096 } 5097 return V[0]; 5098 } 5099 return SDValue(); 5100} 5101 5102// LowerMMXCONCAT_VECTORS - We support concatenate two MMX registers and place 5103// them in a MMX register. This is better than doing a stack convert. 5104static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 5105 DebugLoc dl = Op.getDebugLoc(); 5106 EVT ResVT = Op.getValueType(); 5107 5108 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 5109 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 5110 int Mask[2]; 5111 SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0)); 5112 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 5113 InVec = Op.getOperand(1); 5114 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 5115 unsigned NumElts = ResVT.getVectorNumElements(); 5116 VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 5117 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 5118 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 5119 } else { 5120 InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec); 5121 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 5122 Mask[0] = 0; Mask[1] = 2; 5123 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 5124 } 5125 return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 5126} 5127 5128// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction 5129// to create 256-bit vectors from two other 128-bit ones. 5130static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 5131 DebugLoc dl = Op.getDebugLoc(); 5132 EVT ResVT = Op.getValueType(); 5133 5134 assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide"); 5135 5136 SDValue V1 = Op.getOperand(0); 5137 SDValue V2 = Op.getOperand(1); 5138 unsigned NumElems = ResVT.getVectorNumElements(); 5139 5140 SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1, 5141 DAG.getConstant(0, MVT::i32), DAG, dl); 5142 return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), 5143 DAG, dl); 5144} 5145 5146SDValue 5147X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 5148 EVT ResVT = Op.getValueType(); 5149 5150 assert(Op.getNumOperands() == 2); 5151 assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) && 5152 "Unsupported CONCAT_VECTORS for value type"); 5153 5154 // We support concatenate two MMX registers and place them in a MMX register. 
5155 // This is better than doing a stack convert. 5156 if (ResVT.is128BitVector()) 5157 return LowerMMXCONCAT_VECTORS(Op, DAG); 5158 5159 // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors 5160 // from two other 128-bit ones. 5161 return LowerAVXCONCAT_VECTORS(Op, DAG); 5162} 5163 5164// v8i16 shuffles - Prefer shuffles in the following order: 5165// 1. [all] pshuflw, pshufhw, optional move 5166// 2. [ssse3] 1 x pshufb 5167// 3. [ssse3] 2 x pshufb + 1 x por 5168// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 5169SDValue 5170X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 5171 SelectionDAG &DAG) const { 5172 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5173 SDValue V1 = SVOp->getOperand(0); 5174 SDValue V2 = SVOp->getOperand(1); 5175 DebugLoc dl = SVOp->getDebugLoc(); 5176 SmallVector<int, 8> MaskVals; 5177 5178 // Determine if more than 1 of the words in each of the low and high quadwords 5179 // of the result come from the same quadword of one of the two inputs. Undef 5180 // mask values count as coming from any quadword, for better codegen. 5181 SmallVector<unsigned, 4> LoQuad(4); 5182 SmallVector<unsigned, 4> HiQuad(4); 5183 BitVector InputQuads(4); 5184 for (unsigned i = 0; i < 8; ++i) { 5185 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 5186 int EltIdx = SVOp->getMaskElt(i); 5187 MaskVals.push_back(EltIdx); 5188 if (EltIdx < 0) { 5189 ++Quad[0]; 5190 ++Quad[1]; 5191 ++Quad[2]; 5192 ++Quad[3]; 5193 continue; 5194 } 5195 ++Quad[EltIdx / 4]; 5196 InputQuads.set(EltIdx / 4); 5197 } 5198 5199 int BestLoQuad = -1; 5200 unsigned MaxQuad = 1; 5201 for (unsigned i = 0; i < 4; ++i) { 5202 if (LoQuad[i] > MaxQuad) { 5203 BestLoQuad = i; 5204 MaxQuad = LoQuad[i]; 5205 } 5206 } 5207 5208 int BestHiQuad = -1; 5209 MaxQuad = 1; 5210 for (unsigned i = 0; i < 4; ++i) { 5211 if (HiQuad[i] > MaxQuad) { 5212 BestHiQuad = i; 5213 MaxQuad = HiQuad[i]; 5214 } 5215 } 5216 5217 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 5218 // of the two input vectors, shuffle them into one input vector so only a 5219 // single pshufb instruction is necessary. If There are more than 2 input 5220 // quads, disable the next transformation since it does not help SSSE3. 5221 bool V1Used = InputQuads[0] || InputQuads[1]; 5222 bool V2Used = InputQuads[2] || InputQuads[3]; 5223 if (Subtarget->hasSSSE3()) { 5224 if (InputQuads.count() == 2 && V1Used && V2Used) { 5225 BestLoQuad = InputQuads.find_first(); 5226 BestHiQuad = InputQuads.find_next(BestLoQuad); 5227 } 5228 if (InputQuads.count() > 2) { 5229 BestLoQuad = -1; 5230 BestHiQuad = -1; 5231 } 5232 } 5233 5234 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 5235 // the shuffle mask. If a quad is scored as -1, that means that it contains 5236 // words from all 4 input quadwords. 5237 SDValue NewV; 5238 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 5239 SmallVector<int, 8> MaskV; 5240 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 5241 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 5242 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 5243 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 5244 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 5245 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 5246 5247 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 5248 // source words for the shuffle, to aid later transformations. 
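    // If every mask element already refers to BestLoQuad or BestHiQuad, the
    // indices can be remapped into NewV: words taken from BestLoQuad land in
    // its low half (idx & 3) and words taken from BestHiQuad in its high half
    // ((idx & 3) + 4).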
5249 bool AllWordsInNewV = true; 5250 bool InOrder[2] = { true, true }; 5251 for (unsigned i = 0; i != 8; ++i) { 5252 int idx = MaskVals[i]; 5253 if (idx != (int)i) 5254 InOrder[i/4] = false; 5255 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 5256 continue; 5257 AllWordsInNewV = false; 5258 break; 5259 } 5260 5261 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 5262 if (AllWordsInNewV) { 5263 for (int i = 0; i != 8; ++i) { 5264 int idx = MaskVals[i]; 5265 if (idx < 0) 5266 continue; 5267 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 5268 if ((idx != i) && idx < 4) 5269 pshufhw = false; 5270 if ((idx != i) && idx > 3) 5271 pshuflw = false; 5272 } 5273 V1 = NewV; 5274 V2Used = false; 5275 BestLoQuad = 0; 5276 BestHiQuad = 1; 5277 } 5278 5279 // If we've eliminated the use of V2, and the new mask is a pshuflw or 5280 // pshufhw, that's as cheap as it gets. Return the new shuffle. 5281 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 5282 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 5283 unsigned TargetMask = 0; 5284 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 5285 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 5286 TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): 5287 X86::getShufflePSHUFLWImmediate(NewV.getNode()); 5288 V1 = NewV.getOperand(0); 5289 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 5290 } 5291 } 5292 5293 // If we have SSSE3, and all words of the result are from 1 input vector, 5294 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 5295 // is present, fall back to case 4. 5296 if (Subtarget->hasSSSE3()) { 5297 SmallVector<SDValue,16> pshufbMask; 5298 5299 // If we have elements from both input vectors, set the high bit of the 5300 // shuffle mask element to zero out elements that come from V2 in the V1 5301 // mask, and elements that come from V1 in the V2 mask, so that the two 5302 // results can be OR'd together. 5303 bool TwoInputs = V1Used && V2Used; 5304 for (unsigned i = 0; i != 8; ++i) { 5305 int EltIdx = MaskVals[i] * 2; 5306 if (TwoInputs && (EltIdx >= 16)) { 5307 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5308 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5309 continue; 5310 } 5311 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5312 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 5313 } 5314 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); 5315 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5316 DAG.getNode(ISD::BUILD_VECTOR, dl, 5317 MVT::v16i8, &pshufbMask[0], 16)); 5318 if (!TwoInputs) 5319 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5320 5321 // Calculate the shuffle mask for the second input, shuffle it, and 5322 // OR it with the first shuffled input. 
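    // In this second mask the roles are swapped: bytes that come from V1 are
    // replaced by 0x80 (which pshufb turns into zero), so OR'ing the two
    // partial results merges them without clobbering each other.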
5323 pshufbMask.clear(); 5324 for (unsigned i = 0; i != 8; ++i) { 5325 int EltIdx = MaskVals[i] * 2; 5326 if (EltIdx < 16) { 5327 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5328 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5329 continue; 5330 } 5331 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 5332 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 5333 } 5334 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); 5335 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5336 DAG.getNode(ISD::BUILD_VECTOR, dl, 5337 MVT::v16i8, &pshufbMask[0], 16)); 5338 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5339 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5340 } 5341 5342 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 5343 // and update MaskVals with new element order. 5344 BitVector InOrder(8); 5345 if (BestLoQuad >= 0) { 5346 SmallVector<int, 8> MaskV; 5347 for (int i = 0; i != 4; ++i) { 5348 int idx = MaskVals[i]; 5349 if (idx < 0) { 5350 MaskV.push_back(-1); 5351 InOrder.set(i); 5352 } else if ((idx / 4) == BestLoQuad) { 5353 MaskV.push_back(idx & 3); 5354 InOrder.set(i); 5355 } else { 5356 MaskV.push_back(-1); 5357 } 5358 } 5359 for (unsigned i = 4; i != 8; ++i) 5360 MaskV.push_back(i); 5361 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5362 &MaskV[0]); 5363 5364 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 5365 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 5366 NewV.getOperand(0), 5367 X86::getShufflePSHUFLWImmediate(NewV.getNode()), 5368 DAG); 5369 } 5370 5371 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 5372 // and update MaskVals with the new element order. 5373 if (BestHiQuad >= 0) { 5374 SmallVector<int, 8> MaskV; 5375 for (unsigned i = 0; i != 4; ++i) 5376 MaskV.push_back(i); 5377 for (unsigned i = 4; i != 8; ++i) { 5378 int idx = MaskVals[i]; 5379 if (idx < 0) { 5380 MaskV.push_back(-1); 5381 InOrder.set(i); 5382 } else if ((idx / 4) == BestHiQuad) { 5383 MaskV.push_back((idx & 3) + 4); 5384 InOrder.set(i); 5385 } else { 5386 MaskV.push_back(-1); 5387 } 5388 } 5389 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5390 &MaskV[0]); 5391 5392 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 5393 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 5394 NewV.getOperand(0), 5395 X86::getShufflePSHUFHWImmediate(NewV.getNode()), 5396 DAG); 5397 } 5398 5399 // In case BestHi & BestLo were both -1, which means each quadword has a word 5400 // from each of the four input quadwords, calculate the InOrder bitvector now 5401 // before falling through to the insert/extract cleanup. 5402 if (BestLoQuad == -1 && BestHiQuad == -1) { 5403 NewV = V1; 5404 for (int i = 0; i != 8; ++i) 5405 if (MaskVals[i] < 0 || MaskVals[i] == i) 5406 InOrder.set(i); 5407 } 5408 5409 // The other elements are put in the right place using pextrw and pinsrw. 5410 for (unsigned i = 0; i != 8; ++i) { 5411 if (InOrder[i]) 5412 continue; 5413 int EltIdx = MaskVals[i]; 5414 if (EltIdx < 0) 5415 continue; 5416 SDValue ExtOp = (EltIdx < 8) 5417 ? 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 5418 DAG.getIntPtrConstant(EltIdx)) 5419 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 5420 DAG.getIntPtrConstant(EltIdx - 8)); 5421 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 5422 DAG.getIntPtrConstant(i)); 5423 } 5424 return NewV; 5425} 5426 5427// v16i8 shuffles - Prefer shuffles in the following order: 5428// 1. [ssse3] 1 x pshufb 5429// 2. [ssse3] 2 x pshufb + 1 x por 5430// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 5431static 5432SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 5433 SelectionDAG &DAG, 5434 const X86TargetLowering &TLI) { 5435 SDValue V1 = SVOp->getOperand(0); 5436 SDValue V2 = SVOp->getOperand(1); 5437 DebugLoc dl = SVOp->getDebugLoc(); 5438 SmallVector<int, 16> MaskVals; 5439 SVOp->getMask(MaskVals); 5440 5441 // If we have SSSE3, case 1 is generated when all result bytes come from 5442 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 5443 // present, fall back to case 3. 5444 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 5445 bool V1Only = true; 5446 bool V2Only = true; 5447 for (unsigned i = 0; i < 16; ++i) { 5448 int EltIdx = MaskVals[i]; 5449 if (EltIdx < 0) 5450 continue; 5451 if (EltIdx < 16) 5452 V2Only = false; 5453 else 5454 V1Only = false; 5455 } 5456 5457 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 5458 if (TLI.getSubtarget()->hasSSSE3()) { 5459 SmallVector<SDValue,16> pshufbMask; 5460 5461 // If all result elements are from one input vector, then only translate 5462 // undef mask values to 0x80 (zero out result) in the pshufb mask. 5463 // 5464 // Otherwise, we have elements from both input vectors, and must zero out 5465 // elements that come from V2 in the first mask, and V1 in the second mask 5466 // so that we can OR them together. 5467 bool TwoInputs = !(V1Only || V2Only); 5468 for (unsigned i = 0; i != 16; ++i) { 5469 int EltIdx = MaskVals[i]; 5470 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 5471 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5472 continue; 5473 } 5474 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5475 } 5476 // If all the elements are from V2, assign it to V1 and return after 5477 // building the first pshufb. 5478 if (V2Only) 5479 V1 = V2; 5480 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5481 DAG.getNode(ISD::BUILD_VECTOR, dl, 5482 MVT::v16i8, &pshufbMask[0], 16)); 5483 if (!TwoInputs) 5484 return V1; 5485 5486 // Calculate the shuffle mask for the second input, shuffle it, and 5487 // OR it with the first shuffled input. 5488 pshufbMask.clear(); 5489 for (unsigned i = 0; i != 16; ++i) { 5490 int EltIdx = MaskVals[i]; 5491 if (EltIdx < 16) { 5492 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5493 continue; 5494 } 5495 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 5496 } 5497 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5498 DAG.getNode(ISD::BUILD_VECTOR, dl, 5499 MVT::v16i8, &pshufbMask[0], 16)); 5500 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5501 } 5502 5503 // No SSSE3 - Calculate in place words and then fix all out of place words 5504 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 5505 // the 16 different words that comprise the two doublequadword input vectors. 5506 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5507 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); 5508 SDValue NewV = V2Only ? 
V2 : V1; 5509 for (int i = 0; i != 8; ++i) { 5510 int Elt0 = MaskVals[i*2]; 5511 int Elt1 = MaskVals[i*2+1]; 5512 5513 // This word of the result is all undef, skip it. 5514 if (Elt0 < 0 && Elt1 < 0) 5515 continue; 5516 5517 // This word of the result is already in the correct place, skip it. 5518 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 5519 continue; 5520 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 5521 continue; 5522 5523 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 5524 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 5525 SDValue InsElt; 5526 5527 // If Elt0 and Elt1 are defined, are consecutive, and can be load 5528 // using a single extract together, load it and store it. 5529 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 5530 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5531 DAG.getIntPtrConstant(Elt1 / 2)); 5532 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5533 DAG.getIntPtrConstant(i)); 5534 continue; 5535 } 5536 5537 // If Elt1 is defined, extract it from the appropriate source. If the 5538 // source byte is not also odd, shift the extracted word left 8 bits 5539 // otherwise clear the bottom 8 bits if we need to do an or. 5540 if (Elt1 >= 0) { 5541 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5542 DAG.getIntPtrConstant(Elt1 / 2)); 5543 if ((Elt1 & 1) == 0) 5544 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 5545 DAG.getConstant(8, 5546 TLI.getShiftAmountTy(InsElt.getValueType()))); 5547 else if (Elt0 >= 0) 5548 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 5549 DAG.getConstant(0xFF00, MVT::i16)); 5550 } 5551 // If Elt0 is defined, extract it from the appropriate source. If the 5552 // source byte is not also even, shift the extracted word right 8 bits. If 5553 // Elt1 was also defined, OR the extracted values together before 5554 // inserting them in the result. 5555 if (Elt0 >= 0) { 5556 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 5557 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 5558 if ((Elt0 & 1) != 0) 5559 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 5560 DAG.getConstant(8, 5561 TLI.getShiftAmountTy(InsElt0.getValueType()))); 5562 else if (Elt1 >= 0) 5563 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 5564 DAG.getConstant(0x00FF, MVT::i16)); 5565 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 5566 : InsElt0; 5567 } 5568 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5569 DAG.getIntPtrConstant(i)); 5570 } 5571 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); 5572} 5573 5574/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 5575/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 5576/// done when every pair / quad of shuffle mask elements point to elements in 5577/// the right sequence. e.g. 5578/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> 5579static 5580SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 5581 SelectionDAG &DAG, DebugLoc dl) { 5582 EVT VT = SVOp->getValueType(0); 5583 SDValue V1 = SVOp->getOperand(0); 5584 SDValue V2 = SVOp->getOperand(1); 5585 unsigned NumElems = VT.getVectorNumElements(); 5586 unsigned NewWidth = (NumElems == 4) ? 
2 : 4; 5587 EVT NewVT; 5588 switch (VT.getSimpleVT().SimpleTy) { 5589 default: assert(false && "Unexpected!"); 5590 case MVT::v4f32: NewVT = MVT::v2f64; break; 5591 case MVT::v4i32: NewVT = MVT::v2i64; break; 5592 case MVT::v8i16: NewVT = MVT::v4i32; break; 5593 case MVT::v16i8: NewVT = MVT::v4i32; break; 5594 } 5595 5596 int Scale = NumElems / NewWidth; 5597 SmallVector<int, 8> MaskVec; 5598 for (unsigned i = 0; i < NumElems; i += Scale) { 5599 int StartIdx = -1; 5600 for (int j = 0; j < Scale; ++j) { 5601 int EltIdx = SVOp->getMaskElt(i+j); 5602 if (EltIdx < 0) 5603 continue; 5604 if (StartIdx == -1) 5605 StartIdx = EltIdx - (EltIdx % Scale); 5606 if (EltIdx != StartIdx + j) 5607 return SDValue(); 5608 } 5609 if (StartIdx == -1) 5610 MaskVec.push_back(-1); 5611 else 5612 MaskVec.push_back(StartIdx / Scale); 5613 } 5614 5615 V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); 5616 V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); 5617 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 5618} 5619 5620/// getVZextMovL - Return a zero-extending vector move low node. 5621/// 5622static SDValue getVZextMovL(EVT VT, EVT OpVT, 5623 SDValue SrcOp, SelectionDAG &DAG, 5624 const X86Subtarget *Subtarget, DebugLoc dl) { 5625 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 5626 LoadSDNode *LD = NULL; 5627 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 5628 LD = dyn_cast<LoadSDNode>(SrcOp); 5629 if (!LD) { 5630 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 5631 // instead. 5632 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 5633 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && 5634 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 5635 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && 5636 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 5637 // PR2108 5638 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 5639 return DAG.getNode(ISD::BITCAST, dl, VT, 5640 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 5641 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5642 OpVT, 5643 SrcOp.getOperand(0) 5644 .getOperand(0)))); 5645 } 5646 } 5647 } 5648 5649 return DAG.getNode(ISD::BITCAST, dl, VT, 5650 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 5651 DAG.getNode(ISD::BITCAST, dl, 5652 OpVT, SrcOp))); 5653} 5654 5655/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector 5656/// shuffle node referes to only one lane in the sources. 5657static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) { 5658 EVT VT = SVOp->getValueType(0); 5659 int NumElems = VT.getVectorNumElements(); 5660 int HalfSize = NumElems/2; 5661 SmallVector<int, 16> M; 5662 SVOp->getMask(M); 5663 bool MatchA = false, MatchB = false; 5664 5665 for (int l = 0; l < NumElems*2; l += HalfSize) { 5666 if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) { 5667 MatchA = true; 5668 break; 5669 } 5670 } 5671 5672 for (int l = 0; l < NumElems*2; l += HalfSize) { 5673 if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) { 5674 MatchB = true; 5675 break; 5676 } 5677 } 5678 5679 return MatchA && MatchB; 5680} 5681 5682/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles 5683/// which could not be matched by any known target speficic shuffle 5684static SDValue 5685LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 5686 if (areShuffleHalvesWithinDisjointLanes(SVOp)) { 5687 // If each half of a vector shuffle node referes to only one lane in the 5688 // source vectors, extract each used 128-bit lane and shuffle them using 5689 // 128-bit shuffles. 
Then, concatenate the results. Otherwise leave 5690 // the work to the legalizer. 5691 DebugLoc dl = SVOp->getDebugLoc(); 5692 EVT VT = SVOp->getValueType(0); 5693 int NumElems = VT.getVectorNumElements(); 5694 int HalfSize = NumElems/2; 5695 5696 // Extract the reference for each half 5697 int FstVecExtractIdx = 0, SndVecExtractIdx = 0; 5698 int FstVecOpNum = 0, SndVecOpNum = 0; 5699 for (int i = 0; i < HalfSize; ++i) { 5700 int Elt = SVOp->getMaskElt(i); 5701 if (SVOp->getMaskElt(i) < 0) 5702 continue; 5703 FstVecOpNum = Elt/NumElems; 5704 FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize; 5705 break; 5706 } 5707 for (int i = HalfSize; i < NumElems; ++i) { 5708 int Elt = SVOp->getMaskElt(i); 5709 if (SVOp->getMaskElt(i) < 0) 5710 continue; 5711 SndVecOpNum = Elt/NumElems; 5712 SndVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize; 5713 break; 5714 } 5715 5716 // Extract the subvectors 5717 SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum), 5718 DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl); 5719 SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum), 5720 DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl); 5721 5722 // Generate 128-bit shuffles 5723 SmallVector<int, 16> MaskV1, MaskV2; 5724 for (int i = 0; i < HalfSize; ++i) { 5725 int Elt = SVOp->getMaskElt(i); 5726 MaskV1.push_back(Elt < 0 ? Elt : Elt % HalfSize); 5727 } 5728 for (int i = HalfSize; i < NumElems; ++i) { 5729 int Elt = SVOp->getMaskElt(i); 5730 MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize); 5731 } 5732 5733 EVT NVT = V1.getValueType(); 5734 V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]); 5735 V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]); 5736 5737 // Concatenate the result back 5738 SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1, 5739 DAG.getConstant(0, MVT::i32), DAG, dl); 5740 return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), 5741 DAG, dl); 5742 } 5743 5744 return SDValue(); 5745} 5746 5747/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with 5748/// 4 elements, and match them with several different shuffle types. 5749static SDValue 5750LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 5751 SDValue V1 = SVOp->getOperand(0); 5752 SDValue V2 = SVOp->getOperand(1); 5753 DebugLoc dl = SVOp->getDebugLoc(); 5754 EVT VT = SVOp->getValueType(0); 5755 5756 assert(VT.getSizeInBits() == 128 && "Unsupported vector size"); 5757 5758 SmallVector<std::pair<int, int>, 8> Locs; 5759 Locs.resize(4); 5760 SmallVector<int, 8> Mask1(4U, -1); 5761 SmallVector<int, 8> PermMask; 5762 SVOp->getMask(PermMask); 5763 5764 unsigned NumHi = 0; 5765 unsigned NumLo = 0; 5766 for (unsigned i = 0; i != 4; ++i) { 5767 int Idx = PermMask[i]; 5768 if (Idx < 0) { 5769 Locs[i] = std::make_pair(-1, -1); 5770 } else { 5771 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 5772 if (Idx < 4) { 5773 Locs[i] = std::make_pair(0, NumLo); 5774 Mask1[NumLo] = Idx; 5775 NumLo++; 5776 } else { 5777 Locs[i] = std::make_pair(1, NumHi); 5778 if (2+NumHi < 4) 5779 Mask1[2+NumHi] = Idx; 5780 NumHi++; 5781 } 5782 } 5783 } 5784 5785 if (NumLo <= 2 && NumHi <= 2) { 5786 // If no more than two elements come from either vector. This can be 5787 // implemented with two shuffles. First shuffle gather the elements. 5788 // The second shuffle, which takes the first shuffle as both of its 5789 // vector operands, put the elements into the right order. 
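    // For example, for the mask <0, 4, 1, 5> the first shuffle produces
    // <V1[0], V1[1], V2[0], V2[1]>, and the second shuffle reorders that
    // single vector into the requested <V1[0], V2[0], V1[1], V2[1]>.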
5790 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5791 5792 SmallVector<int, 8> Mask2(4U, -1); 5793 5794 for (unsigned i = 0; i != 4; ++i) { 5795 if (Locs[i].first == -1) 5796 continue; 5797 else { 5798 unsigned Idx = (i < 2) ? 0 : 4; 5799 Idx += Locs[i].first * 2 + Locs[i].second; 5800 Mask2[i] = Idx; 5801 } 5802 } 5803 5804 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 5805 } else if (NumLo == 3 || NumHi == 3) { 5806 // Otherwise, we must have three elements from one vector, call it X, and 5807 // one element from the other, call it Y. First, use a shufps to build an 5808 // intermediate vector with the one element from Y and the element from X 5809 // that will be in the same half in the final destination (the indexes don't 5810 // matter). Then, use a shufps to build the final vector, taking the half 5811 // containing the element from Y from the intermediate, and the other half 5812 // from X. 5813 if (NumHi == 3) { 5814 // Normalize it so the 3 elements come from V1. 5815 CommuteVectorShuffleMask(PermMask, VT); 5816 std::swap(V1, V2); 5817 } 5818 5819 // Find the element from V2. 5820 unsigned HiIndex; 5821 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 5822 int Val = PermMask[HiIndex]; 5823 if (Val < 0) 5824 continue; 5825 if (Val >= 4) 5826 break; 5827 } 5828 5829 Mask1[0] = PermMask[HiIndex]; 5830 Mask1[1] = -1; 5831 Mask1[2] = PermMask[HiIndex^1]; 5832 Mask1[3] = -1; 5833 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5834 5835 if (HiIndex >= 2) { 5836 Mask1[0] = PermMask[0]; 5837 Mask1[1] = PermMask[1]; 5838 Mask1[2] = HiIndex & 1 ? 6 : 4; 5839 Mask1[3] = HiIndex & 1 ? 4 : 6; 5840 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5841 } else { 5842 Mask1[0] = HiIndex & 1 ? 2 : 0; 5843 Mask1[1] = HiIndex & 1 ? 0 : 2; 5844 Mask1[2] = PermMask[2]; 5845 Mask1[3] = PermMask[3]; 5846 if (Mask1[2] >= 0) 5847 Mask1[2] += 4; 5848 if (Mask1[3] >= 0) 5849 Mask1[3] += 4; 5850 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 5851 } 5852 } 5853 5854 // Break it into (shuffle shuffle_hi, shuffle_lo). 
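  // The general case: gather the elements destined for the low half of the
  // result into LoMask and those for the high half into HiMask, emit one
  // shuffle of V1/V2 for each, and blend the two partial results with a
  // final shuffle.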
5855 Locs.clear(); 5856 Locs.resize(4); 5857 SmallVector<int,8> LoMask(4U, -1); 5858 SmallVector<int,8> HiMask(4U, -1); 5859 5860 SmallVector<int,8> *MaskPtr = &LoMask; 5861 unsigned MaskIdx = 0; 5862 unsigned LoIdx = 0; 5863 unsigned HiIdx = 2; 5864 for (unsigned i = 0; i != 4; ++i) { 5865 if (i == 2) { 5866 MaskPtr = &HiMask; 5867 MaskIdx = 1; 5868 LoIdx = 0; 5869 HiIdx = 2; 5870 } 5871 int Idx = PermMask[i]; 5872 if (Idx < 0) { 5873 Locs[i] = std::make_pair(-1, -1); 5874 } else if (Idx < 4) { 5875 Locs[i] = std::make_pair(MaskIdx, LoIdx); 5876 (*MaskPtr)[LoIdx] = Idx; 5877 LoIdx++; 5878 } else { 5879 Locs[i] = std::make_pair(MaskIdx, HiIdx); 5880 (*MaskPtr)[HiIdx] = Idx; 5881 HiIdx++; 5882 } 5883 } 5884 5885 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 5886 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 5887 SmallVector<int, 8> MaskOps; 5888 for (unsigned i = 0; i != 4; ++i) { 5889 if (Locs[i].first == -1) { 5890 MaskOps.push_back(-1); 5891 } else { 5892 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 5893 MaskOps.push_back(Idx); 5894 } 5895 } 5896 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 5897} 5898 5899static bool MayFoldVectorLoad(SDValue V) { 5900 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 5901 V = V.getOperand(0); 5902 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5903 V = V.getOperand(0); 5904 if (MayFoldLoad(V)) 5905 return true; 5906 return false; 5907} 5908 5909// FIXME: the version above should always be used. Since there's 5910// a bug where several vector shuffles can't be folded because the 5911// DAG is not updated during lowering and a node claims to have two 5912// uses while it only has one, use this version, and let isel match 5913// another instruction if the load really happens to have more than 5914// one use. Remove this version after this bug get fixed. 5915// rdar://8434668, PR8156 5916static bool RelaxedMayFoldVectorLoad(SDValue V) { 5917 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 5918 V = V.getOperand(0); 5919 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5920 V = V.getOperand(0); 5921 if (ISD::isNormalLoad(V.getNode())) 5922 return true; 5923 return false; 5924} 5925 5926/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by 5927/// a vector extract, and if both can be later optimized into a single load. 5928/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked 5929/// here because otherwise a target specific shuffle node is going to be 5930/// emitted for this shuffle, and the optimization not done. 5931/// FIXME: This is probably not the best approach, but fix the problem 5932/// until the right path is decided. 5933static 5934bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, 5935 const TargetLowering &TLI) { 5936 EVT VT = V.getValueType(); 5937 ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); 5938 5939 // Be sure that the vector shuffle is present in a pattern like this: 5940 // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) 5941 if (!V.hasOneUse()) 5942 return false; 5943 5944 SDNode *N = *V.getNode()->use_begin(); 5945 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 5946 return false; 5947 5948 SDValue EltNo = N->getOperand(1); 5949 if (!isa<ConstantSDNode>(EltNo)) 5950 return false; 5951 5952 // If the bit convert changed the number of elements, it is unsafe 5953 // to examine the mask. 
5954 bool HasShuffleIntoBitcast = false; 5955 if (V.getOpcode() == ISD::BITCAST) { 5956 EVT SrcVT = V.getOperand(0).getValueType(); 5957 if (SrcVT.getVectorNumElements() != VT.getVectorNumElements()) 5958 return false; 5959 V = V.getOperand(0); 5960 HasShuffleIntoBitcast = true; 5961 } 5962 5963 // Select the input vector, guarding against out of range extract vector. 5964 unsigned NumElems = VT.getVectorNumElements(); 5965 unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 5966 int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt); 5967 V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); 5968 5969 // Skip one more bit_convert if necessary 5970 if (V.getOpcode() == ISD::BITCAST) 5971 V = V.getOperand(0); 5972 5973 if (ISD::isNormalLoad(V.getNode())) { 5974 // Is the original load suitable? 5975 LoadSDNode *LN0 = cast<LoadSDNode>(V); 5976 5977 // FIXME: avoid the multi-use bug that is preventing lots of 5978 // of foldings to be detected, this is still wrong of course, but 5979 // give the temporary desired behavior, and if it happens that 5980 // the load has real more uses, during isel it will not fold, and 5981 // will generate poor code. 5982 if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse() 5983 return false; 5984 5985 if (!HasShuffleIntoBitcast) 5986 return true; 5987 5988 // If there's a bitcast before the shuffle, check if the load type and 5989 // alignment is valid. 5990 unsigned Align = LN0->getAlignment(); 5991 unsigned NewAlign = 5992 TLI.getTargetData()->getABITypeAlignment( 5993 VT.getTypeForEVT(*DAG.getContext())); 5994 5995 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) 5996 return false; 5997 } 5998 5999 return true; 6000} 6001 6002static 6003SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { 6004 EVT VT = Op.getValueType(); 6005 6006 // Canonizalize to v2f64. 6007 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); 6008 return DAG.getNode(ISD::BITCAST, dl, VT, 6009 getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, 6010 V1, DAG)); 6011} 6012 6013static 6014SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, 6015 bool HasSSE2) { 6016 SDValue V1 = Op.getOperand(0); 6017 SDValue V2 = Op.getOperand(1); 6018 EVT VT = Op.getValueType(); 6019 6020 assert(VT != MVT::v2i64 && "unsupported shuffle type"); 6021 6022 if (HasSSE2 && VT == MVT::v2f64) 6023 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); 6024 6025 // v4f32 or v4i32 6026 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG); 6027} 6028 6029static 6030SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { 6031 SDValue V1 = Op.getOperand(0); 6032 SDValue V2 = Op.getOperand(1); 6033 EVT VT = Op.getValueType(); 6034 6035 assert((VT == MVT::v4i32 || VT == MVT::v4f32) && 6036 "unsupported shuffle type"); 6037 6038 if (V2.getOpcode() == ISD::UNDEF) 6039 V2 = V1; 6040 6041 // v4i32 or v4f32 6042 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); 6043} 6044 6045static 6046SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { 6047 SDValue V1 = Op.getOperand(0); 6048 SDValue V2 = Op.getOperand(1); 6049 EVT VT = Op.getValueType(); 6050 unsigned NumElems = VT.getVectorNumElements(); 6051 6052 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second 6053 // operand of these instructions is only memory, so check if there's a 6054 // potencial load folding here, otherwise use SHUFPS or MOVSD to match the 6055 // same masks. 
6056 bool CanFoldLoad = false; 6057 6058 // Trivial case, when V2 comes from a load. 6059 if (MayFoldVectorLoad(V2)) 6060 CanFoldLoad = true; 6061 6062 // When V1 is a load, it can be folded later into a store in isel, example: 6063 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) 6064 // turns into: 6065 // (MOVLPSmr addr:$src1, VR128:$src2) 6066 // So, recognize this potential and also use MOVLPS or MOVLPD 6067 if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) 6068 CanFoldLoad = true; 6069 6070 // Both of them can't be memory operations though. 6071 if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2)) 6072 CanFoldLoad = false; 6073 6074 if (CanFoldLoad) { 6075 if (HasSSE2 && NumElems == 2) 6076 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); 6077 6078 if (NumElems == 4) 6079 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); 6080 } 6081 6082 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6083 // movl and movlp will both match v2i64, but v2i64 is never matched by 6084 // movl earlier because we make it strict to avoid messing with the movlp load 6085 // folding logic (see the code above getMOVLP call). Match it here then, 6086 // this is horrible, but will stay like this until we move all shuffle 6087 // matching to x86 specific nodes. Note that for the 1st condition all 6088 // types are matched with movsd. 6089 if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp)) 6090 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 6091 else if (HasSSE2) 6092 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 6093 6094 6095 assert(VT != MVT::v4i32 && "unsupported shuffle type"); 6096 6097 // Invert the operand order and use SHUFPS to match it. 6098 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1, 6099 X86::getShuffleSHUFImmediate(SVOp), DAG); 6100} 6101 6102static inline unsigned getUNPCKLOpcode(EVT VT) { 6103 switch(VT.getSimpleVT().SimpleTy) { 6104 case MVT::v4i32: return X86ISD::PUNPCKLDQ; 6105 case MVT::v2i64: return X86ISD::PUNPCKLQDQ; 6106 case MVT::v4f32: return X86ISD::UNPCKLPS; 6107 case MVT::v2f64: return X86ISD::UNPCKLPD; 6108 case MVT::v8i32: // Use fp unit for int unpack. 6109 case MVT::v8f32: return X86ISD::VUNPCKLPSY; 6110 case MVT::v4i64: // Use fp unit for int unpack. 6111 case MVT::v4f64: return X86ISD::VUNPCKLPDY; 6112 case MVT::v16i8: return X86ISD::PUNPCKLBW; 6113 case MVT::v8i16: return X86ISD::PUNPCKLWD; 6114 default: 6115 llvm_unreachable("Unknown type for unpckl"); 6116 } 6117 return 0; 6118} 6119 6120static inline unsigned getUNPCKHOpcode(EVT VT) { 6121 switch(VT.getSimpleVT().SimpleTy) { 6122 case MVT::v4i32: return X86ISD::PUNPCKHDQ; 6123 case MVT::v2i64: return X86ISD::PUNPCKHQDQ; 6124 case MVT::v4f32: return X86ISD::UNPCKHPS; 6125 case MVT::v2f64: return X86ISD::UNPCKHPD; 6126 case MVT::v8i32: // Use fp unit for int unpack. 6127 case MVT::v8f32: return X86ISD::VUNPCKHPSY; 6128 case MVT::v4i64: // Use fp unit for int unpack. 
6129 case MVT::v4f64: return X86ISD::VUNPCKHPDY; 6130 case MVT::v16i8: return X86ISD::PUNPCKHBW; 6131 case MVT::v8i16: return X86ISD::PUNPCKHWD; 6132 default: 6133 llvm_unreachable("Unknown type for unpckh"); 6134 } 6135 return 0; 6136} 6137 6138static inline unsigned getVPERMILOpcode(EVT VT) { 6139 switch(VT.getSimpleVT().SimpleTy) { 6140 case MVT::v4i32: 6141 case MVT::v4f32: return X86ISD::VPERMILPS; 6142 case MVT::v2i64: 6143 case MVT::v2f64: return X86ISD::VPERMILPD; 6144 case MVT::v8i32: 6145 case MVT::v8f32: return X86ISD::VPERMILPSY; 6146 case MVT::v4i64: 6147 case MVT::v4f64: return X86ISD::VPERMILPDY; 6148 default: 6149 llvm_unreachable("Unknown type for vpermil"); 6150 } 6151 return 0; 6152} 6153 6154/// isVectorBroadcast - Check if the node chain is suitable to be xformed to 6155/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming 6156/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded. 6157static bool isVectorBroadcast(SDValue &Op) { 6158 EVT VT = Op.getValueType(); 6159 bool Is256 = VT.getSizeInBits() == 256; 6160 6161 assert((VT.getSizeInBits() == 128 || Is256) && 6162 "Unsupported type for vbroadcast node"); 6163 6164 SDValue V = Op; 6165 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 6166 V = V.getOperand(0); 6167 6168 if (Is256 && !(V.hasOneUse() && 6169 V.getOpcode() == ISD::INSERT_SUBVECTOR && 6170 V.getOperand(0).getOpcode() == ISD::UNDEF)) 6171 return false; 6172 6173 if (Is256) 6174 V = V.getOperand(1); 6175 if (V.hasOneUse() && V.getOpcode() != ISD::SCALAR_TO_VECTOR) 6176 return false; 6177 6178 // Check the source scalar_to_vector type. 256-bit broadcasts are 6179 // supported for 32/64-bit sizes, while 128-bit ones are only supported 6180 // for 32-bit scalars. 6181 unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits(); 6182 if (ScalarSize != 32 && ScalarSize != 64) 6183 return false; 6184 if (!Is256 && ScalarSize == 64) 6185 return false; 6186 6187 V = V.getOperand(0); 6188 if (!MayFoldLoad(V)) 6189 return false; 6190 6191 // Return the load node 6192 Op = V; 6193 return true; 6194} 6195 6196static 6197SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, 6198 const TargetLowering &TLI, 6199 const X86Subtarget *Subtarget) { 6200 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6201 EVT VT = Op.getValueType(); 6202 DebugLoc dl = Op.getDebugLoc(); 6203 SDValue V1 = Op.getOperand(0); 6204 SDValue V2 = Op.getOperand(1); 6205 6206 if (isZeroShuffle(SVOp)) 6207 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 6208 6209 // Handle splat operations 6210 if (SVOp->isSplat()) { 6211 unsigned NumElem = VT.getVectorNumElements(); 6212 // Special case, this is the only place now where it's allowed to return 6213 // a vector_shuffle operation without using a target specific node, because 6214 // *hopefully* it will be optimized away by the dag combiner. FIXME: should 6215 // this be moved to DAGCombine instead? 6216 if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI)) 6217 return Op; 6218 6219 // Use vbroadcast whenever the splat comes from a foldable load 6220 if (Subtarget->hasAVX() && isVectorBroadcast(V1)) 6221 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1); 6222 6223 // Handle splats by matching through known shuffle masks 6224 if (VT.is128BitVector() && NumElem <= 4) 6225 return SDValue(); 6226 6227 // All remaning splats are promoted to target supported vector shuffles. 
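// (Roughly: i8/i16 splats are first widened until the splat value fills a
// 32-bit lane and are then splatted with a pshufd-style mask; 256-bit
// splats duplicate the relevant 128-bit half and use a vpermilps-style
// mask. See PromoteSplat for the details.)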
6228 return PromoteSplat(SVOp, DAG); 6229 } 6230 6231 // If the shuffle can be profitably rewritten as a narrower shuffle, then 6232 // do it! 6233 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 6234 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6235 if (NewOp.getNode()) 6236 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); 6237 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 6238 // FIXME: Figure out a cleaner way to do this. 6239 // Try to make use of movq to zero out the top part. 6240 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 6241 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6242 if (NewOp.getNode()) { 6243 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 6244 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 6245 DAG, Subtarget, dl); 6246 } 6247 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 6248 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6249 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 6250 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 6251 DAG, Subtarget, dl); 6252 } 6253 } 6254 return SDValue(); 6255} 6256 6257SDValue 6258X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 6259 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6260 SDValue V1 = Op.getOperand(0); 6261 SDValue V2 = Op.getOperand(1); 6262 EVT VT = Op.getValueType(); 6263 DebugLoc dl = Op.getDebugLoc(); 6264 unsigned NumElems = VT.getVectorNumElements(); 6265 bool isMMX = VT.getSizeInBits() == 64; 6266 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 6267 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 6268 bool V1IsSplat = false; 6269 bool V2IsSplat = false; 6270 bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX(); 6271 bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX(); 6272 bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX(); 6273 MachineFunction &MF = DAG.getMachineFunction(); 6274 bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); 6275 6276 // Shuffle operations on MMX not supported. 6277 if (isMMX) 6278 return Op; 6279 6280 // Vector shuffle lowering takes 3 steps: 6281 // 6282 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable 6283 // narrowing and commutation of operands should be handled. 6284 // 2) Matching of shuffles with known shuffle masks to x86 target specific 6285 // shuffle nodes. 6286 // 3) Rewriting of unmatched masks into new generic shuffle operations, 6287 // so the shuffle can be broken into other shuffles and the legalizer can 6288 // try the lowering again. 6289 // 6290 // The general ideia is that no vector_shuffle operation should be left to 6291 // be matched during isel, all of them must be converted to a target specific 6292 // node here. 6293 6294 // Normalize the input vectors. Here splats, zeroed vectors, profitable 6295 // narrowing and commutation of operands should be handled. The actual code 6296 // doesn't include all of those, work in progress... 6297 SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget); 6298 if (NewOp.getNode()) 6299 return NewOp; 6300 6301 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and 6302 // unpckh_undef). Only use pshufd if speed is more important than size. 
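// For example, the v4i32 mask <0,0,1,1> can be emitted either as
// punpckldq %xmm0, %xmm0 or as pshufd $0x50, %xmm0, %xmm0. The unpck form
// saves the immediate byte, but it is destructive (the destination is also
// a source), so pshufd, which can write a fresh destination register, is
// preferred unless we are optimizing for size.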
6303 if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) 6304 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 6305 if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) 6306 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 6307 6308 if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef && 6309 RelaxedMayFoldVectorLoad(V1)) 6310 return getMOVDDup(Op, dl, V1, DAG); 6311 6312 if (X86::isMOVHLPS_v_undef_Mask(SVOp)) 6313 return getMOVHighToLow(Op, dl, DAG); 6314 6315 // Use to match splats 6316 if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef && 6317 (VT == MVT::v2f64 || VT == MVT::v2i64)) 6318 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 6319 6320 if (X86::isPSHUFDMask(SVOp)) { 6321 // The actual implementation will match the mask in the if above and then 6322 // during isel it can match several different instructions, not only pshufd 6323 // as its name says, sad but true, emulate the behavior for now... 6324 if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 6325 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 6326 6327 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 6328 6329 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 6330 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 6331 6332 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 6333 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1, 6334 TargetMask, DAG); 6335 6336 if (VT == MVT::v4f32) 6337 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1, 6338 TargetMask, DAG); 6339 } 6340 6341 // Check if this can be converted into a logical shift. 6342 bool isLeft = false; 6343 unsigned ShAmt = 0; 6344 SDValue ShVal; 6345 bool isShift = getSubtarget()->hasSSE2() && 6346 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 6347 if (isShift && ShVal.hasOneUse()) { 6348 // If the shifted value has multiple uses, it may be cheaper to use 6349 // v_set0 + movlhps or movhlps, etc. 6350 EVT EltVT = VT.getVectorElementType(); 6351 ShAmt *= EltVT.getSizeInBits(); 6352 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 6353 } 6354 6355 if (X86::isMOVLMask(SVOp)) { 6356 if (V1IsUndef) 6357 return V2; 6358 if (ISD::isBuildVectorAllZeros(V1.getNode())) 6359 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 6360 if (!X86::isMOVLPMask(SVOp)) { 6361 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 6362 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 6363 6364 if (VT == MVT::v4i32 || VT == MVT::v4f32) 6365 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 6366 } 6367 } 6368 6369 // FIXME: fold these into legal mask. 6370 if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) 6371 return getMOVLowToHigh(Op, dl, DAG, HasSSE2); 6372 6373 if (X86::isMOVHLPSMask(SVOp)) 6374 return getMOVHighToLow(Op, dl, DAG); 6375 6376 if (X86::isMOVSHDUPMask(SVOp, Subtarget)) 6377 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 6378 6379 if (X86::isMOVSLDUPMask(SVOp, Subtarget)) 6380 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 6381 6382 if (X86::isMOVLPMask(SVOp)) 6383 return getMOVLP(Op, dl, DAG, HasSSE2); 6384 6385 if (ShouldXformToMOVHLPS(SVOp) || 6386 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 6387 return CommuteVectorShuffle(SVOp, DAG); 6388 6389 if (isShift) { 6390 // No better options. Use a vshl / vsrl. 
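// For example, a v4i32 shuffle of V1 with zero such as <4,0,1,2> is just
// the whole register shifted up by one 32-bit element, so a single
// pslldq-style byte shift is cheaper than any shuffle sequence here
// (illustrative example).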
6391 EVT EltVT = VT.getVectorElementType(); 6392 ShAmt *= EltVT.getSizeInBits(); 6393 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 6394 } 6395 6396 bool Commuted = false; 6397 // FIXME: This should also accept a bitcast of a splat? Be careful, not 6398 // 1,1,1,1 -> v8i16 though. 6399 V1IsSplat = isSplatVector(V1.getNode()); 6400 V2IsSplat = isSplatVector(V2.getNode()); 6401 6402 // Canonicalize the splat or undef, if present, to be on the RHS. 6403 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 6404 Op = CommuteVectorShuffle(SVOp, DAG); 6405 SVOp = cast<ShuffleVectorSDNode>(Op); 6406 V1 = SVOp->getOperand(0); 6407 V2 = SVOp->getOperand(1); 6408 std::swap(V1IsSplat, V2IsSplat); 6409 std::swap(V1IsUndef, V2IsUndef); 6410 Commuted = true; 6411 } 6412 6413 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 6414 // Shuffling low element of v1 into undef, just return v1. 6415 if (V2IsUndef) 6416 return V1; 6417 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 6418 // the instruction selector will not match, so get a canonical MOVL with 6419 // swapped operands to undo the commute. 6420 return getMOVL(DAG, dl, VT, V2, V1); 6421 } 6422 6423 if (X86::isUNPCKLMask(SVOp)) 6424 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); 6425 6426 if (X86::isUNPCKHMask(SVOp)) 6427 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); 6428 6429 if (V2IsSplat) { 6430 // Normalize mask so all entries that point to V2 points to its first 6431 // element then try to match unpck{h|l} again. If match, return a 6432 // new vector_shuffle with the corrected mask. 6433 SDValue NewMask = NormalizeMask(SVOp, DAG); 6434 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 6435 if (NSVOp != SVOp) { 6436 if (X86::isUNPCKLMask(NSVOp, true)) { 6437 return NewMask; 6438 } else if (X86::isUNPCKHMask(NSVOp, true)) { 6439 return NewMask; 6440 } 6441 } 6442 } 6443 6444 if (Commuted) { 6445 // Commute is back and try unpck* again. 6446 // FIXME: this seems wrong. 6447 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 6448 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 6449 6450 if (X86::isUNPCKLMask(NewSVOp)) 6451 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); 6452 6453 if (X86::isUNPCKHMask(NewSVOp)) 6454 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); 6455 } 6456 6457 // Normalize the node to match x86 shuffle ops if needed 6458 if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 6459 return CommuteVectorShuffle(SVOp, DAG); 6460 6461 // The checks below are all present in isShuffleMaskLegal, but they are 6462 // inlined here right now to enable us to directly emit target specific 6463 // nodes, and remove one by one until they don't return Op anymore. 
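// For instance, with SSSE3 a v8i16 mask such as <1,2,3,4,5,6,7,8> is
// recognized by isPALIGNRMask below and becomes a single palignr that
// shifts the concatenation of the two inputs right by two bytes
// (illustrative example).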
6464 SmallVector<int, 16> M; 6465 SVOp->getMask(M); 6466 6467 if (isPALIGNRMask(M, VT, HasSSSE3)) 6468 return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, 6469 X86::getShufflePALIGNRImmediate(SVOp), 6470 DAG); 6471 6472 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 6473 SVOp->getSplatIndex() == 0 && V2IsUndef) { 6474 if (VT == MVT::v2f64) 6475 return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); 6476 if (VT == MVT::v2i64) 6477 return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); 6478 } 6479 6480 if (isPSHUFHWMask(M, VT)) 6481 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 6482 X86::getShufflePSHUFHWImmediate(SVOp), 6483 DAG); 6484 6485 if (isPSHUFLWMask(M, VT)) 6486 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 6487 X86::getShufflePSHUFLWImmediate(SVOp), 6488 DAG); 6489 6490 if (isSHUFPMask(M, VT)) { 6491 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 6492 if (VT == MVT::v4f32 || VT == MVT::v4i32) 6493 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2, 6494 TargetMask, DAG); 6495 if (VT == MVT::v2f64 || VT == MVT::v2i64) 6496 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2, 6497 TargetMask, DAG); 6498 } 6499 6500 if (X86::isUNPCKL_v_undef_Mask(SVOp)) 6501 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 6502 if (X86::isUNPCKH_v_undef_Mask(SVOp)) 6503 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 6504 6505 //===--------------------------------------------------------------------===// 6506 // Generate target specific nodes for 128 or 256-bit shuffles only 6507 // supported in the AVX instruction set. 6508 // 6509 6510 // Handle VPERMILPS* permutations 6511 if (isVPERMILPSMask(M, VT, Subtarget)) 6512 return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, 6513 getShuffleVPERMILPSImmediate(SVOp), DAG); 6514 6515 // Handle VPERMILPD* permutations 6516 if (isVPERMILPDMask(M, VT, Subtarget)) 6517 return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, 6518 getShuffleVPERMILPDImmediate(SVOp), DAG); 6519 6520 // Handle VPERM2F128 permutations 6521 if (isVPERM2F128Mask(M, VT, Subtarget)) 6522 return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2, 6523 getShuffleVPERM2F128Immediate(SVOp), DAG); 6524 6525 //===--------------------------------------------------------------------===// 6526 // Since no target specific shuffle was selected for this generic one, 6527 // lower it into other known shuffles. FIXME: this isn't true yet, but 6528 // this is the plan. 6529 // 6530 6531 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 6532 if (VT == MVT::v8i16) { 6533 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 6534 if (NewOp.getNode()) 6535 return NewOp; 6536 } 6537 6538 if (VT == MVT::v16i8) { 6539 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 6540 if (NewOp.getNode()) 6541 return NewOp; 6542 } 6543 6544 // Handle all 128-bit wide vectors with 4 elements, and match them with 6545 // several different shuffle types. 
6546 if (NumElems == 4 && VT.getSizeInBits() == 128) 6547 return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); 6548 6549 // Handle general 256-bit shuffles 6550 if (VT.is256BitVector()) 6551 return LowerVECTOR_SHUFFLE_256(SVOp, DAG); 6552 6553 return SDValue(); 6554} 6555 6556SDValue 6557X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 6558 SelectionDAG &DAG) const { 6559 EVT VT = Op.getValueType(); 6560 DebugLoc dl = Op.getDebugLoc(); 6561 6562 if (Op.getOperand(0).getValueType().getSizeInBits() != 128) 6563 return SDValue(); 6564 6565 if (VT.getSizeInBits() == 8) { 6566 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 6567 Op.getOperand(0), Op.getOperand(1)); 6568 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6569 DAG.getValueType(VT)); 6570 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6571 } else if (VT.getSizeInBits() == 16) { 6572 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6573 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 6574 if (Idx == 0) 6575 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6576 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6577 DAG.getNode(ISD::BITCAST, dl, 6578 MVT::v4i32, 6579 Op.getOperand(0)), 6580 Op.getOperand(1))); 6581 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 6582 Op.getOperand(0), Op.getOperand(1)); 6583 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6584 DAG.getValueType(VT)); 6585 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6586 } else if (VT == MVT::f32) { 6587 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 6588 // the result back to FR32 register. It's only worth matching if the 6589 // result has a single use which is a store or a bitcast to i32. And in 6590 // the case of a store, it's not worth it if the index is a constant 0, 6591 // because a MOVSSmr can be used instead, which is smaller and faster. 6592 if (!Op.hasOneUse()) 6593 return SDValue(); 6594 SDNode *User = *Op.getNode()->use_begin(); 6595 if ((User->getOpcode() != ISD::STORE || 6596 (isa<ConstantSDNode>(Op.getOperand(1)) && 6597 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 6598 (User->getOpcode() != ISD::BITCAST || 6599 User->getValueType(0) != MVT::i32)) 6600 return SDValue(); 6601 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6602 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 6603 Op.getOperand(0)), 6604 Op.getOperand(1)); 6605 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 6606 } else if (VT == MVT::i32) { 6607 // ExtractPS works with constant index. 6608 if (isa<ConstantSDNode>(Op.getOperand(1))) 6609 return Op; 6610 } 6611 return SDValue(); 6612} 6613 6614 6615SDValue 6616X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 6617 SelectionDAG &DAG) const { 6618 if (!isa<ConstantSDNode>(Op.getOperand(1))) 6619 return SDValue(); 6620 6621 SDValue Vec = Op.getOperand(0); 6622 EVT VecVT = Vec.getValueType(); 6623 6624 // If this is a 256-bit vector result, first extract the 128-bit vector and 6625 // then extract the element from the 128-bit vector. 6626 if (VecVT.getSizeInBits() == 256) { 6627 DebugLoc dl = Op.getNode()->getDebugLoc(); 6628 unsigned NumElems = VecVT.getVectorNumElements(); 6629 SDValue Idx = Op.getOperand(1); 6630 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 6631 6632 // Get the 128-bit vector. 6633 bool Upper = IdxVal >= NumElems/2; 6634 Vec = Extract128BitVector(Vec, 6635 DAG.getConstant(Upper ? 
NumElems/2 : 0, MVT::i32), DAG, dl); 6636 6637 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, 6638 Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx); 6639 } 6640 6641 assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); 6642 6643 if (Subtarget->hasSSE41() || Subtarget->hasAVX()) { 6644 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 6645 if (Res.getNode()) 6646 return Res; 6647 } 6648 6649 EVT VT = Op.getValueType(); 6650 DebugLoc dl = Op.getDebugLoc(); 6651 // TODO: handle v16i8. 6652 if (VT.getSizeInBits() == 16) { 6653 SDValue Vec = Op.getOperand(0); 6654 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6655 if (Idx == 0) 6656 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6657 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6658 DAG.getNode(ISD::BITCAST, dl, 6659 MVT::v4i32, Vec), 6660 Op.getOperand(1))); 6661 // Transform it so it match pextrw which produces a 32-bit result. 6662 EVT EltVT = MVT::i32; 6663 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 6664 Op.getOperand(0), Op.getOperand(1)); 6665 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 6666 DAG.getValueType(VT)); 6667 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6668 } else if (VT.getSizeInBits() == 32) { 6669 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6670 if (Idx == 0) 6671 return Op; 6672 6673 // SHUFPS the element to the lowest double word, then movss. 6674 int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; 6675 EVT VVT = Op.getOperand(0).getValueType(); 6676 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6677 DAG.getUNDEF(VVT), Mask); 6678 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6679 DAG.getIntPtrConstant(0)); 6680 } else if (VT.getSizeInBits() == 64) { 6681 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 6682 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 6683 // to match extract_elt for f64. 6684 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6685 if (Idx == 0) 6686 return Op; 6687 6688 // UNPCKHPD the element to the lowest double word, then movsd. 6689 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 6690 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 6691 int Mask[2] = { 1, -1 }; 6692 EVT VVT = Op.getOperand(0).getValueType(); 6693 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6694 DAG.getUNDEF(VVT), Mask); 6695 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6696 DAG.getIntPtrConstant(0)); 6697 } 6698 6699 return SDValue(); 6700} 6701 6702SDValue 6703X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 6704 SelectionDAG &DAG) const { 6705 EVT VT = Op.getValueType(); 6706 EVT EltVT = VT.getVectorElementType(); 6707 DebugLoc dl = Op.getDebugLoc(); 6708 6709 SDValue N0 = Op.getOperand(0); 6710 SDValue N1 = Op.getOperand(1); 6711 SDValue N2 = Op.getOperand(2); 6712 6713 if (VT.getSizeInBits() == 256) 6714 return SDValue(); 6715 6716 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 6717 isa<ConstantSDNode>(N2)) { 6718 unsigned Opc; 6719 if (VT == MVT::v8i16) 6720 Opc = X86ISD::PINSRW; 6721 else if (VT == MVT::v16i8) 6722 Opc = X86ISD::PINSRB; 6723 else 6724 Opc = X86ISD::PINSRB; 6725 6726 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 6727 // argument. 
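// For example, inserting an i8 element becomes pinsrb $idx, %r32, %xmm:
// the byte is any-extended to i32 below and the instruction only uses the
// low bits of the GR32 source.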
6728 if (N1.getValueType() != MVT::i32) 6729 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 6730 if (N2.getValueType() != MVT::i32) 6731 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 6732 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 6733 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 6734 // Bits [7:6] of the constant are the source select. This will always be 6735 // zero here. The DAG Combiner may combine an extract_elt index into these 6736 // bits. For example (insert (extract, 3), 2) could be matched by putting 6737 // the '3' into bits [7:6] of X86ISD::INSERTPS. 6738 // Bits [5:4] of the constant are the destination select. This is the 6739 // value of the incoming immediate. 6740 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 6741 // combine either bitwise AND or insert of float 0.0 to set these bits. 6742 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 6743 // Create this as a scalar to vector.. 6744 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 6745 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 6746 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 6747 // PINSR* works with constant index. 6748 return Op; 6749 } 6750 return SDValue(); 6751} 6752 6753SDValue 6754X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 6755 EVT VT = Op.getValueType(); 6756 EVT EltVT = VT.getVectorElementType(); 6757 6758 DebugLoc dl = Op.getDebugLoc(); 6759 SDValue N0 = Op.getOperand(0); 6760 SDValue N1 = Op.getOperand(1); 6761 SDValue N2 = Op.getOperand(2); 6762 6763 // If this is a 256-bit vector result, first extract the 128-bit vector, 6764 // insert the element into the extracted half and then place it back. 6765 if (VT.getSizeInBits() == 256) { 6766 if (!isa<ConstantSDNode>(N2)) 6767 return SDValue(); 6768 6769 // Get the desired 128-bit vector half. 6770 unsigned NumElems = VT.getVectorNumElements(); 6771 unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); 6772 bool Upper = IdxVal >= NumElems/2; 6773 SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32); 6774 SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl); 6775 6776 // Insert the element into the desired half. 6777 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, 6778 N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2); 6779 6780 // Insert the changed part back to the 256-bit vector 6781 return Insert128BitVector(N0, V, Ins128Idx, DAG, dl); 6782 } 6783 6784 if (Subtarget->hasSSE41() || Subtarget->hasAVX()) 6785 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 6786 6787 if (EltVT == MVT::i8) 6788 return SDValue(); 6789 6790 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 6791 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 6792 // as its second argument. 
6793 if (N1.getValueType() != MVT::i32) 6794 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 6795 if (N2.getValueType() != MVT::i32) 6796 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 6797 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 6798 } 6799 return SDValue(); 6800} 6801 6802SDValue 6803X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { 6804 LLVMContext *Context = DAG.getContext(); 6805 DebugLoc dl = Op.getDebugLoc(); 6806 EVT OpVT = Op.getValueType(); 6807 6808 // If this is a 256-bit vector result, first insert into a 128-bit 6809 // vector and then insert into the 256-bit vector. 6810 if (OpVT.getSizeInBits() > 128) { 6811 // Insert into a 128-bit vector. 6812 EVT VT128 = EVT::getVectorVT(*Context, 6813 OpVT.getVectorElementType(), 6814 OpVT.getVectorNumElements() / 2); 6815 6816 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); 6817 6818 // Insert the 128-bit vector. 6819 return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op, 6820 DAG.getConstant(0, MVT::i32), 6821 DAG, dl); 6822 } 6823 6824 if (Op.getValueType() == MVT::v1i64 && 6825 Op.getOperand(0).getValueType() == MVT::i64) 6826 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 6827 6828 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 6829 assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 && 6830 "Expected an SSE type!"); 6831 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), 6832 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); 6833} 6834 6835// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in 6836// a simple subregister reference or explicit instructions to grab 6837// upper bits of a vector. 6838SDValue 6839X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 6840 if (Subtarget->hasAVX()) { 6841 DebugLoc dl = Op.getNode()->getDebugLoc(); 6842 SDValue Vec = Op.getNode()->getOperand(0); 6843 SDValue Idx = Op.getNode()->getOperand(1); 6844 6845 if (Op.getNode()->getValueType(0).getSizeInBits() == 128 6846 && Vec.getNode()->getValueType(0).getSizeInBits() == 256) { 6847 return Extract128BitVector(Vec, Idx, DAG, dl); 6848 } 6849 } 6850 return SDValue(); 6851} 6852 6853// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a 6854// simple superregister reference or explicit instructions to insert 6855// the upper bits of a vector. 6856SDValue 6857X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 6858 if (Subtarget->hasAVX()) { 6859 DebugLoc dl = Op.getNode()->getDebugLoc(); 6860 SDValue Vec = Op.getNode()->getOperand(0); 6861 SDValue SubVec = Op.getNode()->getOperand(1); 6862 SDValue Idx = Op.getNode()->getOperand(2); 6863 6864 if (Op.getNode()->getValueType(0).getSizeInBits() == 256 6865 && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) { 6866 return Insert128BitVector(Vec, SubVec, Idx, DAG, dl); 6867 } 6868 } 6869 return SDValue(); 6870} 6871 6872// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 6873// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 6874// one of the above mentioned nodes. It has to be wrapped because otherwise 6875// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 6876// be used to form addressing mode. These wrapped nodes will be selected 6877// into MOV32ri. 
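// For example, in non-PIC 32-bit code a constant-pool address is just the
// absolute symbol (movl $.LCPI0_0, %eax), while the PIC paths below produce
// PIC-base- or RIP-relative forms such as leal .LCPI0_0@GOTOFF(%reg), %eax
// or a .LCPI0_0(%rip) operand folded into the user (illustrative examples).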
6878SDValue 6879X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 6880 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 6881 6882 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6883 // global base reg. 6884 unsigned char OpFlag = 0; 6885 unsigned WrapperKind = X86ISD::Wrapper; 6886 CodeModel::Model M = getTargetMachine().getCodeModel(); 6887 6888 if (Subtarget->isPICStyleRIPRel() && 6889 (M == CodeModel::Small || M == CodeModel::Kernel)) 6890 WrapperKind = X86ISD::WrapperRIP; 6891 else if (Subtarget->isPICStyleGOT()) 6892 OpFlag = X86II::MO_GOTOFF; 6893 else if (Subtarget->isPICStyleStubPIC()) 6894 OpFlag = X86II::MO_PIC_BASE_OFFSET; 6895 6896 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 6897 CP->getAlignment(), 6898 CP->getOffset(), OpFlag); 6899 DebugLoc DL = CP->getDebugLoc(); 6900 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6901 // With PIC, the address is actually $g + Offset. 6902 if (OpFlag) { 6903 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6904 DAG.getNode(X86ISD::GlobalBaseReg, 6905 DebugLoc(), getPointerTy()), 6906 Result); 6907 } 6908 6909 return Result; 6910} 6911 6912SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 6913 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 6914 6915 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6916 // global base reg. 6917 unsigned char OpFlag = 0; 6918 unsigned WrapperKind = X86ISD::Wrapper; 6919 CodeModel::Model M = getTargetMachine().getCodeModel(); 6920 6921 if (Subtarget->isPICStyleRIPRel() && 6922 (M == CodeModel::Small || M == CodeModel::Kernel)) 6923 WrapperKind = X86ISD::WrapperRIP; 6924 else if (Subtarget->isPICStyleGOT()) 6925 OpFlag = X86II::MO_GOTOFF; 6926 else if (Subtarget->isPICStyleStubPIC()) 6927 OpFlag = X86II::MO_PIC_BASE_OFFSET; 6928 6929 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 6930 OpFlag); 6931 DebugLoc DL = JT->getDebugLoc(); 6932 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6933 6934 // With PIC, the address is actually $g + Offset. 6935 if (OpFlag) 6936 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6937 DAG.getNode(X86ISD::GlobalBaseReg, 6938 DebugLoc(), getPointerTy()), 6939 Result); 6940 6941 return Result; 6942} 6943 6944SDValue 6945X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 6946 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 6947 6948 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6949 // global base reg. 6950 unsigned char OpFlag = 0; 6951 unsigned WrapperKind = X86ISD::Wrapper; 6952 CodeModel::Model M = getTargetMachine().getCodeModel(); 6953 6954 if (Subtarget->isPICStyleRIPRel() && 6955 (M == CodeModel::Small || M == CodeModel::Kernel)) { 6956 if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF()) 6957 OpFlag = X86II::MO_GOTPCREL; 6958 WrapperKind = X86ISD::WrapperRIP; 6959 } else if (Subtarget->isPICStyleGOT()) { 6960 OpFlag = X86II::MO_GOT; 6961 } else if (Subtarget->isPICStyleStubPIC()) { 6962 OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE; 6963 } else if (Subtarget->isPICStyleStubNoDynamic()) { 6964 OpFlag = X86II::MO_DARWIN_NONLAZY; 6965 } 6966 6967 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 6968 6969 DebugLoc DL = Op.getDebugLoc(); 6970 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6971 6972 6973 // With PIC, the address is actually $g + Offset. 
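// ($g here is the PIC base materialized by X86ISD::GlobalBaseReg, e.g. the
// address of the GOT on 32-bit ELF or the picbase label on Darwin.)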
6974 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 6975 !Subtarget->is64Bit()) { 6976 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6977 DAG.getNode(X86ISD::GlobalBaseReg, 6978 DebugLoc(), getPointerTy()), 6979 Result); 6980 } 6981 6982 // For symbols that require a load from a stub to get the address, emit the 6983 // load. 6984 if (isGlobalStubReference(OpFlag)) 6985 Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, 6986 MachinePointerInfo::getGOT(), false, false, 0); 6987 6988 return Result; 6989} 6990 6991SDValue 6992X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 6993 // Create the TargetBlockAddressAddress node. 6994 unsigned char OpFlags = 6995 Subtarget->ClassifyBlockAddressReference(); 6996 CodeModel::Model M = getTargetMachine().getCodeModel(); 6997 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 6998 DebugLoc dl = Op.getDebugLoc(); 6999 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 7000 /*isTarget=*/true, OpFlags); 7001 7002 if (Subtarget->isPICStyleRIPRel() && 7003 (M == CodeModel::Small || M == CodeModel::Kernel)) 7004 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 7005 else 7006 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 7007 7008 // With PIC, the address is actually $g + Offset. 7009 if (isGlobalRelativeToPICBase(OpFlags)) { 7010 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 7011 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 7012 Result); 7013 } 7014 7015 return Result; 7016} 7017 7018SDValue 7019X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 7020 int64_t Offset, 7021 SelectionDAG &DAG) const { 7022 // Create the TargetGlobalAddress node, folding in the constant 7023 // offset if it is legal. 7024 unsigned char OpFlags = 7025 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 7026 CodeModel::Model M = getTargetMachine().getCodeModel(); 7027 SDValue Result; 7028 if (OpFlags == X86II::MO_NO_FLAG && 7029 X86::isOffsetSuitableForCodeModel(Offset, M)) { 7030 // A direct static reference to a global. 7031 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 7032 Offset = 0; 7033 } else { 7034 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 7035 } 7036 7037 if (Subtarget->isPICStyleRIPRel() && 7038 (M == CodeModel::Small || M == CodeModel::Kernel)) 7039 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 7040 else 7041 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 7042 7043 // With PIC, the address is actually $g + Offset. 7044 if (isGlobalRelativeToPICBase(OpFlags)) { 7045 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 7046 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 7047 Result); 7048 } 7049 7050 // For globals that require a load from a stub to get the address, emit the 7051 // load. 7052 if (isGlobalStubReference(OpFlags)) 7053 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 7054 MachinePointerInfo::getGOT(), false, false, 0); 7055 7056 // If there was a non-zero offset that we didn't fold, create an explicit 7057 // addition for it. 
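// For example, a reference to (gv + 8) that had to go through a GOT or
// stub load first is completed here as "loaded address + 8".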
7058 if (Offset != 0) 7059 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 7060 DAG.getConstant(Offset, getPointerTy())); 7061 7062 return Result; 7063} 7064 7065SDValue 7066X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 7067 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 7068 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 7069 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 7070} 7071 7072static SDValue 7073GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 7074 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 7075 unsigned char OperandFlags) { 7076 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7077 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 7078 DebugLoc dl = GA->getDebugLoc(); 7079 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7080 GA->getValueType(0), 7081 GA->getOffset(), 7082 OperandFlags); 7083 if (InFlag) { 7084 SDValue Ops[] = { Chain, TGA, *InFlag }; 7085 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 7086 } else { 7087 SDValue Ops[] = { Chain, TGA }; 7088 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 7089 } 7090 7091 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 7092 MFI->setAdjustsStack(true); 7093 7094 SDValue Flag = Chain.getValue(1); 7095 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 7096} 7097 7098// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 7099static SDValue 7100LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7101 const EVT PtrVT) { 7102 SDValue InFlag; 7103 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 7104 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 7105 DAG.getNode(X86ISD::GlobalBaseReg, 7106 DebugLoc(), PtrVT), InFlag); 7107 InFlag = Chain.getValue(1); 7108 7109 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 7110} 7111 7112// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 7113static SDValue 7114LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7115 const EVT PtrVT) { 7116 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 7117 X86::RAX, X86II::MO_TLSGD); 7118} 7119 7120// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 7121// "local exec" model. 7122static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7123 const EVT PtrVT, TLSModel::Model model, 7124 bool is64Bit) { 7125 DebugLoc dl = GA->getDebugLoc(); 7126 7127 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 7128 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 7129 is64Bit ? 257 : 256)); 7130 7131 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 7132 DAG.getIntPtrConstant(0), 7133 MachinePointerInfo(Ptr), false, false, 0); 7134 7135 unsigned char OperandFlags = 0; 7136 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 7137 // initialexec. 7138 unsigned WrapperKind = X86ISD::Wrapper; 7139 if (model == TLSModel::LocalExec) { 7140 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 7141 } else if (is64Bit) { 7142 assert(model == TLSModel::InitialExec); 7143 OperandFlags = X86II::MO_GOTTPOFF; 7144 WrapperKind = X86ISD::WrapperRIP; 7145 } else { 7146 assert(model == TLSModel::InitialExec); 7147 OperandFlags = X86II::MO_INDNTPOFF; 7148 } 7149 7150 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 7151 // exec) 7152 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7153 GA->getValueType(0), 7154 GA->getOffset(), OperandFlags); 7155 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 7156 7157 if (model == TLSModel::InitialExec) 7158 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 7159 MachinePointerInfo::getGOT(), false, false, 0); 7160 7161 // The address of the thread local variable is the add of the thread 7162 // pointer with the offset of the variable. 7163 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 7164} 7165 7166SDValue 7167X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 7168 7169 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 7170 const GlobalValue *GV = GA->getGlobal(); 7171 7172 if (Subtarget->isTargetELF()) { 7173 // TODO: implement the "local dynamic" model 7174 // TODO: implement the "initial exec"model for pic executables 7175 7176 // If GV is an alias then use the aliasee for determining 7177 // thread-localness. 7178 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 7179 GV = GA->resolveAliasedGlobal(false); 7180 7181 TLSModel::Model model 7182 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 7183 7184 switch (model) { 7185 case TLSModel::GeneralDynamic: 7186 case TLSModel::LocalDynamic: // not implemented 7187 if (Subtarget->is64Bit()) 7188 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 7189 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 7190 7191 case TLSModel::InitialExec: 7192 case TLSModel::LocalExec: 7193 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 7194 Subtarget->is64Bit()); 7195 } 7196 } else if (Subtarget->isTargetDarwin()) { 7197 // Darwin only has one model of TLS. Lower to that. 7198 unsigned char OpFlag = 0; 7199 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 7200 X86ISD::WrapperRIP : X86ISD::Wrapper; 7201 7202 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7203 // global base reg. 7204 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 7205 !Subtarget->is64Bit(); 7206 if (PIC32) 7207 OpFlag = X86II::MO_TLVP_PIC_BASE; 7208 else 7209 OpFlag = X86II::MO_TLVP; 7210 DebugLoc DL = Op.getDebugLoc(); 7211 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 7212 GA->getValueType(0), 7213 GA->getOffset(), OpFlag); 7214 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7215 7216 // With PIC32, the address is actually $g + Offset. 7217 if (PIC32) 7218 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7219 DAG.getNode(X86ISD::GlobalBaseReg, 7220 DebugLoc(), getPointerTy()), 7221 Offset); 7222 7223 // Lowering the machine isd will make sure everything is in the right 7224 // location. 7225 SDValue Chain = DAG.getEntryNode(); 7226 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 7227 SDValue Args[] = { Chain, Offset }; 7228 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); 7229 7230 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 
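// (On x86-64 Darwin this typically ends up as
//      movq _var@TLVP(%rip), %rdi
//      callq *(%rdi)
//  with the variable's address returned in %rax, which is read below.)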
7231 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7232 MFI->setAdjustsStack(true); 7233 7234 // And our return value (tls address) is in the standard call return value 7235 // location. 7236 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 7237 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); 7238 } 7239 7240 assert(false && 7241 "TLS not implemented for this target."); 7242 7243 llvm_unreachable("Unreachable"); 7244 return SDValue(); 7245} 7246 7247 7248/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and 7249/// take a 2 x i32 value to shift plus a shift amount. 7250SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const { 7251 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 7252 EVT VT = Op.getValueType(); 7253 unsigned VTBits = VT.getSizeInBits(); 7254 DebugLoc dl = Op.getDebugLoc(); 7255 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 7256 SDValue ShOpLo = Op.getOperand(0); 7257 SDValue ShOpHi = Op.getOperand(1); 7258 SDValue ShAmt = Op.getOperand(2); 7259 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 7260 DAG.getConstant(VTBits - 1, MVT::i8)) 7261 : DAG.getConstant(0, VT); 7262 7263 SDValue Tmp2, Tmp3; 7264 if (Op.getOpcode() == ISD::SHL_PARTS) { 7265 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 7266 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 7267 } else { 7268 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 7269 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 7270 } 7271 7272 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 7273 DAG.getConstant(VTBits, MVT::i8)); 7274 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 7275 AndNode, DAG.getConstant(0, MVT::i8)); 7276 7277 SDValue Hi, Lo; 7278 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7279 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 7280 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 7281 7282 if (Op.getOpcode() == ISD::SHL_PARTS) { 7283 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 7284 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 7285 } else { 7286 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 7287 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 7288 } 7289 7290 SDValue Ops[2] = { Lo, Hi }; 7291 return DAG.getMergeValues(Ops, 2, dl); 7292} 7293 7294SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 7295 SelectionDAG &DAG) const { 7296 EVT SrcVT = Op.getOperand(0).getValueType(); 7297 7298 if (SrcVT.isVector()) 7299 return SDValue(); 7300 7301 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 7302 "Unknown SINT_TO_FP to lower!"); 7303 7304 // These are really Legal; return the operand so the caller accepts it as 7305 // Legal. 
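// (cvtsi2ss/cvtsi2sd convert i32 directly, and i64 as well in 64-bit mode;
// only the remaining combinations need the stack-slot + FILD expansion
// below.)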
7306 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 7307 return Op; 7308 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 7309 Subtarget->is64Bit()) { 7310 return Op; 7311 } 7312 7313 DebugLoc dl = Op.getDebugLoc(); 7314 unsigned Size = SrcVT.getSizeInBits()/8; 7315 MachineFunction &MF = DAG.getMachineFunction(); 7316 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 7317 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7318 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7319 StackSlot, 7320 MachinePointerInfo::getFixedStack(SSFI), 7321 false, false, 0); 7322 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 7323} 7324 7325SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 7326 SDValue StackSlot, 7327 SelectionDAG &DAG) const { 7328 // Build the FILD 7329 DebugLoc DL = Op.getDebugLoc(); 7330 SDVTList Tys; 7331 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 7332 if (useSSE) 7333 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 7334 else 7335 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 7336 7337 unsigned ByteSize = SrcVT.getSizeInBits()/8; 7338 7339 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); 7340 MachineMemOperand *MMO; 7341 if (FI) { 7342 int SSFI = FI->getIndex(); 7343 MMO = 7344 DAG.getMachineFunction() 7345 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7346 MachineMemOperand::MOLoad, ByteSize, ByteSize); 7347 } else { 7348 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); 7349 StackSlot = StackSlot.getOperand(1); 7350 } 7351 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 7352 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 7353 X86ISD::FILD, DL, 7354 Tys, Ops, array_lengthof(Ops), 7355 SrcVT, MMO); 7356 7357 if (useSSE) { 7358 Chain = Result.getValue(1); 7359 SDValue InFlag = Result.getValue(2); 7360 7361 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 7362 // shouldn't be necessary except that RFP cannot be live across 7363 // multiple blocks. When stackifier is fixed, they can be uncoupled. 7364 MachineFunction &MF = DAG.getMachineFunction(); 7365 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 7366 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 7367 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7368 Tys = DAG.getVTList(MVT::Other); 7369 SDValue Ops[] = { 7370 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 7371 }; 7372 MachineMemOperand *MMO = 7373 DAG.getMachineFunction() 7374 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7375 MachineMemOperand::MOStore, SSFISize, SSFISize); 7376 7377 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 7378 Ops, array_lengthof(Ops), 7379 Op.getValueType(), MMO); 7380 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 7381 MachinePointerInfo::getFixedStack(SSFI), 7382 false, false, 0); 7383 } 7384 7385 return Result; 7386} 7387 7388// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 7389SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 7390 SelectionDAG &DAG) const { 7391 // This algorithm is not obvious. Here it is in C code, more or less: 7392 /* 7393 double uint64_to_double( uint32_t hi, uint32_t lo ) { 7394 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 7395 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 7396 7397 // Copy ints to xmm registers. 
7398 __m128i xh = _mm_cvtsi32_si128( hi ); 7399 __m128i xl = _mm_cvtsi32_si128( lo ); 7400 7401 // Combine into low half of a single xmm register. 7402 __m128i x = _mm_unpacklo_epi32( xh, xl ); 7403 __m128d d; 7404 double sd; 7405 7406 // Merge in appropriate exponents to give the integer bits the right 7407 // magnitude. 7408 x = _mm_unpacklo_epi32( x, exp ); 7409 7410 // Subtract away the biases to deal with the IEEE-754 double precision 7411 // implicit 1. 7412 d = _mm_sub_pd( (__m128d) x, bias ); 7413 7414 // All conversions up to here are exact. The correctly rounded result is 7415 // calculated using the current rounding mode using the following 7416 // horizontal add. 7417 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 7418 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 7419 // store doesn't really need to be here (except 7420 // maybe to zero the other double) 7421 return sd; 7422 } 7423 */ 7424 7425 DebugLoc dl = Op.getDebugLoc(); 7426 LLVMContext *Context = DAG.getContext(); 7427 7428 // Build some magic constants. 7429 std::vector<Constant*> CV0; 7430 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 7431 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 7432 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 7433 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 7434 Constant *C0 = ConstantVector::get(CV0); 7435 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 7436 7437 std::vector<Constant*> CV1; 7438 CV1.push_back( 7439 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 7440 CV1.push_back( 7441 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 7442 Constant *C1 = ConstantVector::get(CV1); 7443 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 7444 7445 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7446 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7447 Op.getOperand(0), 7448 DAG.getIntPtrConstant(1))); 7449 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7450 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7451 Op.getOperand(0), 7452 DAG.getIntPtrConstant(0))); 7453 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 7454 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 7455 MachinePointerInfo::getConstantPool(), 7456 false, false, 16); 7457 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 7458 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); 7459 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 7460 MachinePointerInfo::getConstantPool(), 7461 false, false, 16); 7462 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 7463 7464 // Add the halves; easiest way is to swap them into another reg first. 7465 int ShufMask[2] = { 1, -1 }; 7466 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 7467 DAG.getUNDEF(MVT::v2f64), ShufMask); 7468 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 7469 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 7470 DAG.getIntPtrConstant(0)); 7471} 7472 7473// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 7474SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 7475 SelectionDAG &DAG) const { 7476 DebugLoc dl = Op.getDebugLoc(); 7477 // FP constant to bias correct the final result. 
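// The bias is 2^52: or'ing the 32-bit value into the low mantissa bits of
// 2^52 produces the double 2^52 + x exactly, so subtracting the bias again
// recovers x with no rounding.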
7478 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 7479 MVT::f64); 7480 7481 // Load the 32-bit value into an XMM register. 7482 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7483 Op.getOperand(0)); 7484 7485 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 7486 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), 7487 DAG.getIntPtrConstant(0)); 7488 7489 // Or the load with the bias. 7490 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 7491 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7492 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 7493 MVT::v2f64, Load)), 7494 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7495 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 7496 MVT::v2f64, Bias))); 7497 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 7498 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), 7499 DAG.getIntPtrConstant(0)); 7500 7501 // Subtract the bias. 7502 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 7503 7504 // Handle final rounding. 7505 EVT DestVT = Op.getValueType(); 7506 7507 if (DestVT.bitsLT(MVT::f64)) { 7508 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 7509 DAG.getIntPtrConstant(0)); 7510 } else if (DestVT.bitsGT(MVT::f64)) { 7511 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 7512 } 7513 7514 // Handle final rounding. 7515 return Sub; 7516} 7517 7518SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 7519 SelectionDAG &DAG) const { 7520 SDValue N0 = Op.getOperand(0); 7521 DebugLoc dl = Op.getDebugLoc(); 7522 7523 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 7524 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 7525 // the optimization here. 7526 if (DAG.SignBitIsZero(N0)) 7527 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 7528 7529 EVT SrcVT = N0.getValueType(); 7530 EVT DstVT = Op.getValueType(); 7531 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 7532 return LowerUINT_TO_FP_i64(Op, DAG); 7533 else if (SrcVT == MVT::i32 && X86ScalarSSEf64) 7534 return LowerUINT_TO_FP_i32(Op, DAG); 7535 7536 // Make a 64-bit buffer, and use it to build an FILD. 7537 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 7538 if (SrcVT == MVT::i32) { 7539 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 7540 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 7541 getPointerTy(), StackSlot, WordOff); 7542 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7543 StackSlot, MachinePointerInfo(), 7544 false, false, 0); 7545 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 7546 OffsetSlot, MachinePointerInfo(), 7547 false, false, 0); 7548 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 7549 return Fild; 7550 } 7551 7552 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 7553 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7554 StackSlot, MachinePointerInfo(), 7555 false, false, 0); 7556 // For i64 source, we need to add the appropriate power of 2 if the input 7557 // was negative. This is the same as the optimization in 7558 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, 7559 // we must be careful to do the computation in x87 extended precision, not 7560 // in SSE. (The generic code can't know it's OK to do this, or how to.) 
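// Concretely: the FILD below converts a negative i64 input as (x - 2^64),
// so we then conditionally add back 2^64 (0x5F800000 interpreted as an f32,
// selected from the constant pool when the sign bit is set); doing the add
// in x87 80-bit precision keeps it exact.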
7561 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 7562 MachineMemOperand *MMO = 7563 DAG.getMachineFunction() 7564 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7565 MachineMemOperand::MOLoad, 8, 8); 7566 7567 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 7568 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 7569 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3, 7570 MVT::i64, MMO); 7571 7572 APInt FF(32, 0x5F800000ULL); 7573 7574 // Check whether the sign bit is set. 7575 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 7576 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 7577 ISD::SETLT); 7578 7579 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 7580 SDValue FudgePtr = DAG.getConstantPool( 7581 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 7582 getPointerTy()); 7583 7584 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 7585 SDValue Zero = DAG.getIntPtrConstant(0); 7586 SDValue Four = DAG.getIntPtrConstant(4); 7587 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 7588 Zero, Four); 7589 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 7590 7591 // Load the value out, extending it from f32 to f80. 7592 // FIXME: Avoid the extend by constructing the right constant pool? 7593 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), 7594 FudgePtr, MachinePointerInfo::getConstantPool(), 7595 MVT::f32, false, false, 4); 7596 // Extend everything to 80 bits to force it to be done on x87. 7597 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 7598 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 7599} 7600 7601std::pair<SDValue,SDValue> X86TargetLowering:: 7602FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 7603 DebugLoc DL = Op.getDebugLoc(); 7604 7605 EVT DstTy = Op.getValueType(); 7606 7607 if (!IsSigned) { 7608 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 7609 DstTy = MVT::i64; 7610 } 7611 7612 assert(DstTy.getSimpleVT() <= MVT::i64 && 7613 DstTy.getSimpleVT() >= MVT::i16 && 7614 "Unknown FP_TO_SINT to lower!"); 7615 7616 // These are really Legal. 7617 if (DstTy == MVT::i32 && 7618 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 7619 return std::make_pair(SDValue(), SDValue()); 7620 if (Subtarget->is64Bit() && 7621 DstTy == MVT::i64 && 7622 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 7623 return std::make_pair(SDValue(), SDValue()); 7624 7625 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 7626 // stack slot. 
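// (FP_TO_UINT to i32 reuses this path: DstTy was forced to i64 above and
// the caller loads only the low 32 bits of the result, which is enough
// because every u32 value is exactly representable in a signed i64.)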
7627 MachineFunction &MF = DAG.getMachineFunction(); 7628 unsigned MemSize = DstTy.getSizeInBits()/8; 7629 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 7630 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7631 7632 7633 7634 unsigned Opc; 7635 switch (DstTy.getSimpleVT().SimpleTy) { 7636 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 7637 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 7638 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 7639 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 7640 } 7641 7642 SDValue Chain = DAG.getEntryNode(); 7643 SDValue Value = Op.getOperand(0); 7644 EVT TheVT = Op.getOperand(0).getValueType(); 7645 if (isScalarFPTypeInSSEReg(TheVT)) { 7646 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 7647 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 7648 MachinePointerInfo::getFixedStack(SSFI), 7649 false, false, 0); 7650 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 7651 SDValue Ops[] = { 7652 Chain, StackSlot, DAG.getValueType(TheVT) 7653 }; 7654 7655 MachineMemOperand *MMO = 7656 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7657 MachineMemOperand::MOLoad, MemSize, MemSize); 7658 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, 7659 DstTy, MMO); 7660 Chain = Value.getValue(1); 7661 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 7662 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7663 } 7664 7665 MachineMemOperand *MMO = 7666 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7667 MachineMemOperand::MOStore, MemSize, MemSize); 7668 7669 // Build the FP_TO_INT*_IN_MEM 7670 SDValue Ops[] = { Chain, Value, StackSlot }; 7671 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 7672 Ops, 3, DstTy, MMO); 7673 7674 return std::make_pair(FIST, StackSlot); 7675} 7676 7677SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 7678 SelectionDAG &DAG) const { 7679 if (Op.getValueType().isVector()) 7680 return SDValue(); 7681 7682 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 7683 SDValue FIST = Vals.first, StackSlot = Vals.second; 7684 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 7685 if (FIST.getNode() == 0) return Op; 7686 7687 // Load the result. 7688 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 7689 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 7690} 7691 7692SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 7693 SelectionDAG &DAG) const { 7694 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 7695 SDValue FIST = Vals.first, StackSlot = Vals.second; 7696 assert(FIST.getNode() && "Unexpected failure"); 7697 7698 // Load the result. 
7699 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 7700 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 7701} 7702 7703SDValue X86TargetLowering::LowerFABS(SDValue Op, 7704 SelectionDAG &DAG) const { 7705 LLVMContext *Context = DAG.getContext(); 7706 DebugLoc dl = Op.getDebugLoc(); 7707 EVT VT = Op.getValueType(); 7708 EVT EltVT = VT; 7709 if (VT.isVector()) 7710 EltVT = VT.getVectorElementType(); 7711 std::vector<Constant*> CV; 7712 if (EltVT == MVT::f64) { 7713 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 7714 CV.push_back(C); 7715 CV.push_back(C); 7716 } else { 7717 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 7718 CV.push_back(C); 7719 CV.push_back(C); 7720 CV.push_back(C); 7721 CV.push_back(C); 7722 } 7723 Constant *C = ConstantVector::get(CV); 7724 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7725 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7726 MachinePointerInfo::getConstantPool(), 7727 false, false, 16); 7728 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 7729} 7730 7731SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 7732 LLVMContext *Context = DAG.getContext(); 7733 DebugLoc dl = Op.getDebugLoc(); 7734 EVT VT = Op.getValueType(); 7735 EVT EltVT = VT; 7736 if (VT.isVector()) 7737 EltVT = VT.getVectorElementType(); 7738 std::vector<Constant*> CV; 7739 if (EltVT == MVT::f64) { 7740 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 7741 CV.push_back(C); 7742 CV.push_back(C); 7743 } else { 7744 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 7745 CV.push_back(C); 7746 CV.push_back(C); 7747 CV.push_back(C); 7748 CV.push_back(C); 7749 } 7750 Constant *C = ConstantVector::get(CV); 7751 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7752 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7753 MachinePointerInfo::getConstantPool(), 7754 false, false, 16); 7755 if (VT.isVector()) { 7756 return DAG.getNode(ISD::BITCAST, dl, VT, 7757 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 7758 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7759 Op.getOperand(0)), 7760 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); 7761 } else { 7762 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 7763 } 7764} 7765 7766SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 7767 LLVMContext *Context = DAG.getContext(); 7768 SDValue Op0 = Op.getOperand(0); 7769 SDValue Op1 = Op.getOperand(1); 7770 DebugLoc dl = Op.getDebugLoc(); 7771 EVT VT = Op.getValueType(); 7772 EVT SrcVT = Op1.getValueType(); 7773 7774 // If second operand is smaller, extend it first. 7775 if (SrcVT.bitsLT(VT)) { 7776 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 7777 SrcVT = VT; 7778 } 7779 // And if it is bigger, shrink it first. 7780 if (SrcVT.bitsGT(VT)) { 7781 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 7782 SrcVT = VT; 7783 } 7784 7785 // At this point the operands and the result should have the same 7786 // type, and that won't be f80 since that is not custom lowered. 7787 7788 // First get the sign bit of second operand. 
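  // This is done with two constant-pool masks: the first has only the sign bit
  // of lane 0 set and is ANDed with Op1 to isolate its sign, the second clears
  // the sign bit of Op0, and the two pieces are ORed back together.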
7789 std::vector<Constant*> CV; 7790 if (SrcVT == MVT::f64) { 7791 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 7792 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 7793 } else { 7794 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 7795 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7796 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7797 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7798 } 7799 Constant *C = ConstantVector::get(CV); 7800 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7801 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 7802 MachinePointerInfo::getConstantPool(), 7803 false, false, 16); 7804 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 7805 7806 // Shift sign bit right or left if the two operands have different types. 7807 if (SrcVT.bitsGT(VT)) { 7808 // Op0 is MVT::f32, Op1 is MVT::f64. 7809 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 7810 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 7811 DAG.getConstant(32, MVT::i32)); 7812 SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); 7813 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 7814 DAG.getIntPtrConstant(0)); 7815 } 7816 7817 // Clear first operand sign bit. 7818 CV.clear(); 7819 if (VT == MVT::f64) { 7820 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 7821 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 7822 } else { 7823 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 7824 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7825 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7826 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7827 } 7828 C = ConstantVector::get(CV); 7829 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7830 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7831 MachinePointerInfo::getConstantPool(), 7832 false, false, 16); 7833 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 7834 7835 // Or the value with the sign bit. 7836 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 7837} 7838 7839SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { 7840 SDValue N0 = Op.getOperand(0); 7841 DebugLoc dl = Op.getDebugLoc(); 7842 EVT VT = Op.getValueType(); 7843 7844 // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). 7845 SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, 7846 DAG.getConstant(1, VT)); 7847 return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); 7848} 7849 7850/// Emit nodes that will be selected as "test Op0,Op0", or something 7851/// equivalent. 7852SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 7853 SelectionDAG &DAG) const { 7854 DebugLoc dl = Op.getDebugLoc(); 7855 7856 // CF and OF aren't always set the way we want. Determine which 7857 // of these we need. 
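  // The unsigned conditions (A/AE/B/BE) read CF, and the signed/overflow
  // conditions (G/GE/L/LE/O/NO) read OF; when either flag is needed we emit an
  // explicit compare against zero below instead of reusing the flags produced
  // by an arithmetic node.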
7858 bool NeedCF = false; 7859 bool NeedOF = false; 7860 switch (X86CC) { 7861 default: break; 7862 case X86::COND_A: case X86::COND_AE: 7863 case X86::COND_B: case X86::COND_BE: 7864 NeedCF = true; 7865 break; 7866 case X86::COND_G: case X86::COND_GE: 7867 case X86::COND_L: case X86::COND_LE: 7868 case X86::COND_O: case X86::COND_NO: 7869 NeedOF = true; 7870 break; 7871 } 7872 7873 // See if we can use the EFLAGS value from the operand instead of 7874 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 7875 // we prove that the arithmetic won't overflow, we can't use OF or CF. 7876 if (Op.getResNo() != 0 || NeedOF || NeedCF) 7877 // Emit a CMP with 0, which is the TEST pattern. 7878 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 7879 DAG.getConstant(0, Op.getValueType())); 7880 7881 unsigned Opcode = 0; 7882 unsigned NumOperands = 0; 7883 switch (Op.getNode()->getOpcode()) { 7884 case ISD::ADD: 7885 // Due to an isel shortcoming, be conservative if this add is likely to be 7886 // selected as part of a load-modify-store instruction. When the root node 7887 // in a match is a store, isel doesn't know how to remap non-chain non-flag 7888 // uses of other nodes in the match, such as the ADD in this case. This 7889 // leads to the ADD being left around and reselected, with the result being 7890 // two adds in the output. Alas, even if none of our users are stores, that 7891 // doesn't prove we're O.K. Ergo, if we have any parents that aren't 7892 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require 7893 // climbing the DAG back to the root, and it doesn't seem to be worth the 7894 // effort. 7895 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 7896 UE = Op.getNode()->use_end(); UI != UE; ++UI) 7897 if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) 7898 goto default_case; 7899 7900 if (ConstantSDNode *C = 7901 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 7902 // An add of one will be selected as an INC. 7903 if (C->getAPIntValue() == 1) { 7904 Opcode = X86ISD::INC; 7905 NumOperands = 1; 7906 break; 7907 } 7908 7909 // An add of negative one (subtract of one) will be selected as a DEC. 7910 if (C->getAPIntValue().isAllOnesValue()) { 7911 Opcode = X86ISD::DEC; 7912 NumOperands = 1; 7913 break; 7914 } 7915 } 7916 7917 // Otherwise use a regular EFLAGS-setting add. 7918 Opcode = X86ISD::ADD; 7919 NumOperands = 2; 7920 break; 7921 case ISD::AND: { 7922 // If the primary result of the 'and' isn't used, don't bother using X86ISD::AND, 7923 // because a TEST instruction will be better. 7924 bool NonFlagUse = false; 7925 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 7926 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 7927 SDNode *User = *UI; 7928 unsigned UOpNo = UI.getOperandNo(); 7929 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { 7930 // Look past the truncate. 7931 UOpNo = User->use_begin().getOperandNo(); 7932 User = *User->use_begin(); 7933 } 7934 7935 if (User->getOpcode() != ISD::BRCOND && 7936 User->getOpcode() != ISD::SETCC && 7937 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { 7938 NonFlagUse = true; 7939 break; 7940 } 7941 } 7942 7943 if (!NonFlagUse) 7944 break; 7945 } 7946 // FALL THROUGH 7947 case ISD::SUB: 7948 case ISD::OR: 7949 case ISD::XOR: 7950 // Due to the ISEL shortcoming noted above, be conservative if this op is 7951 // likely to be selected as part of a load-modify-store instruction.
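    // Concretely: if any user of this node is a store, fall back to the plain
    // TEST pattern emitted below rather than converting to an EFLAGS-setting
    // opcode.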
7952 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 7953 UE = Op.getNode()->use_end(); UI != UE; ++UI) 7954 if (UI->getOpcode() == ISD::STORE) 7955 goto default_case; 7956 7957 // Otherwise use a regular EFLAGS-setting instruction. 7958 switch (Op.getNode()->getOpcode()) { 7959 default: llvm_unreachable("unexpected operator!"); 7960 case ISD::SUB: Opcode = X86ISD::SUB; break; 7961 case ISD::OR: Opcode = X86ISD::OR; break; 7962 case ISD::XOR: Opcode = X86ISD::XOR; break; 7963 case ISD::AND: Opcode = X86ISD::AND; break; 7964 } 7965 7966 NumOperands = 2; 7967 break; 7968 case X86ISD::ADD: 7969 case X86ISD::SUB: 7970 case X86ISD::INC: 7971 case X86ISD::DEC: 7972 case X86ISD::OR: 7973 case X86ISD::XOR: 7974 case X86ISD::AND: 7975 return SDValue(Op.getNode(), 1); 7976 default: 7977 default_case: 7978 break; 7979 } 7980 7981 if (Opcode == 0) 7982 // Emit a CMP with 0, which is the TEST pattern. 7983 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 7984 DAG.getConstant(0, Op.getValueType())); 7985 7986 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 7987 SmallVector<SDValue, 4> Ops; 7988 for (unsigned i = 0; i != NumOperands; ++i) 7989 Ops.push_back(Op.getOperand(i)); 7990 7991 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 7992 DAG.ReplaceAllUsesWith(Op, New); 7993 return SDValue(New.getNode(), 1); 7994} 7995 7996/// Emit nodes that will be selected as "cmp Op0,Op1", or something 7997/// equivalent. 7998SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 7999 SelectionDAG &DAG) const { 8000 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 8001 if (C->getAPIntValue() == 0) 8002 return EmitTest(Op0, X86CC, DAG); 8003 8004 DebugLoc dl = Op0.getDebugLoc(); 8005 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 8006} 8007 8008/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 8009/// if it's possible. 8010SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 8011 DebugLoc dl, SelectionDAG &DAG) const { 8012 SDValue Op0 = And.getOperand(0); 8013 SDValue Op1 = And.getOperand(1); 8014 if (Op0.getOpcode() == ISD::TRUNCATE) 8015 Op0 = Op0.getOperand(0); 8016 if (Op1.getOpcode() == ISD::TRUNCATE) 8017 Op1 = Op1.getOperand(0); 8018 8019 SDValue LHS, RHS; 8020 if (Op1.getOpcode() == ISD::SHL) 8021 std::swap(Op0, Op1); 8022 if (Op0.getOpcode() == ISD::SHL) { 8023 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 8024 if (And00C->getZExtValue() == 1) { 8025 // If we looked past a truncate, check that it's only truncating away 8026 // known zeros. 8027 unsigned BitWidth = Op0.getValueSizeInBits(); 8028 unsigned AndBitWidth = And.getValueSizeInBits(); 8029 if (BitWidth > AndBitWidth) { 8030 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 8031 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 8032 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 8033 return SDValue(); 8034 } 8035 LHS = Op1; 8036 RHS = Op0.getOperand(1); 8037 } 8038 } else if (Op1.getOpcode() == ISD::Constant) { 8039 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 8040 SDValue AndLHS = Op0; 8041 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 8042 LHS = AndLHS.getOperand(0); 8043 RHS = AndLHS.getOperand(1); 8044 } 8045 } 8046 8047 if (LHS.getNode()) { 8048 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 8049 // instruction. Since the shift amount is in-range-or-undefined, we know 8050 // that doing a bittest on the i32 value is ok. 
We extend to i32 because 8051 // the encoding for the i16 version is larger than the i32 version. 8052 // Also promote i16 to i32 for performance / code size reason. 8053 if (LHS.getValueType() == MVT::i8 || 8054 LHS.getValueType() == MVT::i16) 8055 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 8056 8057 // If the operand types disagree, extend the shift amount to match. Since 8058 // BT ignores high bits (like shifts) we can use anyextend. 8059 if (LHS.getValueType() != RHS.getValueType()) 8060 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 8061 8062 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 8063 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 8064 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8065 DAG.getConstant(Cond, MVT::i8), BT); 8066 } 8067 8068 return SDValue(); 8069} 8070 8071SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 8072 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 8073 SDValue Op0 = Op.getOperand(0); 8074 SDValue Op1 = Op.getOperand(1); 8075 DebugLoc dl = Op.getDebugLoc(); 8076 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 8077 8078 // Optimize to BT if possible. 8079 // Lower (X & (1 << N)) == 0 to BT(X, N). 8080 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 8081 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 8082 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 8083 Op1.getOpcode() == ISD::Constant && 8084 cast<ConstantSDNode>(Op1)->isNullValue() && 8085 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 8086 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 8087 if (NewSetCC.getNode()) 8088 return NewSetCC; 8089 } 8090 8091 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 8092 // these. 8093 if (Op1.getOpcode() == ISD::Constant && 8094 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 8095 cast<ConstantSDNode>(Op1)->isNullValue()) && 8096 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 8097 8098 // If the input is a setcc, then reuse the input setcc or use a new one with 8099 // the inverted condition. 8100 if (Op0.getOpcode() == X86ISD::SETCC) { 8101 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 8102 bool Invert = (CC == ISD::SETNE) ^ 8103 cast<ConstantSDNode>(Op1)->isNullValue(); 8104 if (!Invert) return Op0; 8105 8106 CCode = X86::GetOppositeBranchCondition(CCode); 8107 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8108 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 8109 } 8110 } 8111 8112 bool isFP = Op1.getValueType().isFloatingPoint(); 8113 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 8114 if (X86CC == X86::COND_INVALID) 8115 return SDValue(); 8116 8117 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); 8118 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8119 DAG.getConstant(X86CC, MVT::i8), EFLAGS); 8120} 8121 8122SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 8123 SDValue Cond; 8124 SDValue Op0 = Op.getOperand(0); 8125 SDValue Op1 = Op.getOperand(1); 8126 SDValue CC = Op.getOperand(2); 8127 EVT VT = Op.getValueType(); 8128 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 8129 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 8130 DebugLoc dl = Op.getDebugLoc(); 8131 8132 if (isFP) { 8133 unsigned SSECC = 8; 8134 EVT EltVT = Op0.getValueType().getVectorElementType(); 8135 assert(EltVT == MVT::f32 || EltVT == MVT::f64); 8136 8137 unsigned Opc = EltVT == MVT::f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; 8138 bool Swap = false; 8139 8140 switch (SetCCOpcode) { 8141 default: break; 8142 case ISD::SETOEQ: 8143 case ISD::SETEQ: SSECC = 0; break; 8144 case ISD::SETOGT: 8145 case ISD::SETGT: Swap = true; // Fallthrough 8146 case ISD::SETLT: 8147 case ISD::SETOLT: SSECC = 1; break; 8148 case ISD::SETOGE: 8149 case ISD::SETGE: Swap = true; // Fallthrough 8150 case ISD::SETLE: 8151 case ISD::SETOLE: SSECC = 2; break; 8152 case ISD::SETUO: SSECC = 3; break; 8153 case ISD::SETUNE: 8154 case ISD::SETNE: SSECC = 4; break; 8155 case ISD::SETULE: Swap = true; 8156 case ISD::SETUGE: SSECC = 5; break; 8157 case ISD::SETULT: Swap = true; 8158 case ISD::SETUGT: SSECC = 6; break; 8159 case ISD::SETO: SSECC = 7; break; 8160 } 8161 if (Swap) 8162 std::swap(Op0, Op1); 8163 8164 // In the two special cases we can't handle, emit two comparisons. 8165 if (SSECC == 8) { 8166 if (SetCCOpcode == ISD::SETUEQ) { 8167 SDValue UNORD, EQ; 8168 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 8169 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 8170 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 8171 } 8172 else if (SetCCOpcode == ISD::SETONE) { 8173 SDValue ORD, NEQ; 8174 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 8175 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 8176 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 8177 } 8178 llvm_unreachable("Illegal FP comparison"); 8179 } 8180 // Handle all other FP comparisons here. 8181 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 8182 } 8183 8184 if (!isFP && VT.getSizeInBits() == 256) 8185 return SDValue(); 8186 8187 // We are handling one of the integer comparisons here. Since SSE only has 8188 // GT and EQ comparisons for integer, swapping operands and multiple 8189 // operations may be required for some comparisons. 8190 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 8191 bool Swap = false, Invert = false, FlipSigns = false; 8192 8193 switch (VT.getSimpleVT().SimpleTy) { 8194 default: break; 8195 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 8196 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 8197 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 8198 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 8199 } 8200 8201 switch (SetCCOpcode) { 8202 default: break; 8203 case ISD::SETNE: Invert = true; 8204 case ISD::SETEQ: Opc = EQOpc; break; 8205 case ISD::SETLT: Swap = true; 8206 case ISD::SETGT: Opc = GTOpc; break; 8207 case ISD::SETGE: Swap = true; 8208 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 8209 case ISD::SETULT: Swap = true; 8210 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 8211 case ISD::SETUGE: Swap = true; 8212 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 8213 } 8214 if (Swap) 8215 std::swap(Op0, Op1); 8216 8217 // Since SSE has no unsigned integer comparisons, we need to flip the sign 8218 // bits of the inputs before performing those operations. 
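  // For N-bit lanes, x <u y is equivalent to (x ^ SignBit) <s (y ^ SignBit),
  // so XORing both operands with a splat of the sign bit lets the signed
  // PCMPGT produce the unsigned comparison result.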
8219 if (FlipSigns) { 8220 EVT EltVT = VT.getVectorElementType(); 8221 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 8222 EltVT); 8223 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 8224 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 8225 SignBits.size()); 8226 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 8227 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 8228 } 8229 8230 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 8231 8232 // If the logical-not of the result is required, perform that now. 8233 if (Invert) 8234 Result = DAG.getNOT(dl, Result, VT); 8235 8236 return Result; 8237} 8238 8239// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 8240static bool isX86LogicalCmp(SDValue Op) { 8241 unsigned Opc = Op.getNode()->getOpcode(); 8242 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 8243 return true; 8244 if (Op.getResNo() == 1 && 8245 (Opc == X86ISD::ADD || 8246 Opc == X86ISD::SUB || 8247 Opc == X86ISD::ADC || 8248 Opc == X86ISD::SBB || 8249 Opc == X86ISD::SMUL || 8250 Opc == X86ISD::UMUL || 8251 Opc == X86ISD::INC || 8252 Opc == X86ISD::DEC || 8253 Opc == X86ISD::OR || 8254 Opc == X86ISD::XOR || 8255 Opc == X86ISD::AND)) 8256 return true; 8257 8258 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 8259 return true; 8260 8261 return false; 8262} 8263 8264static bool isZero(SDValue V) { 8265 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 8266 return C && C->isNullValue(); 8267} 8268 8269static bool isAllOnes(SDValue V) { 8270 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 8271 return C && C->isAllOnesValue(); 8272} 8273 8274SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 8275 bool addTest = true; 8276 SDValue Cond = Op.getOperand(0); 8277 SDValue Op1 = Op.getOperand(1); 8278 SDValue Op2 = Op.getOperand(2); 8279 DebugLoc DL = Op.getDebugLoc(); 8280 SDValue CC; 8281 8282 if (Cond.getOpcode() == ISD::SETCC) { 8283 SDValue NewCond = LowerSETCC(Cond, DAG); 8284 if (NewCond.getNode()) 8285 Cond = NewCond; 8286 } 8287 8288 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y 8289 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y 8290 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y 8291 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y 8292 if (Cond.getOpcode() == X86ISD::SETCC && 8293 Cond.getOperand(1).getOpcode() == X86ISD::CMP && 8294 isZero(Cond.getOperand(1).getOperand(1))) { 8295 SDValue Cmp = Cond.getOperand(1); 8296 8297 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); 8298 8299 if ((isAllOnes(Op1) || isAllOnes(Op2)) && 8300 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { 8301 SDValue Y = isAllOnes(Op2) ? Op1 : Op2; 8302 8303 SDValue CmpOp0 = Cmp.getOperand(0); 8304 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, 8305 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 8306 8307 SDValue Res = // Res = 0 or -1. 8308 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 8309 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 8310 8311 if (isAllOnes(Op1) != (CondCode == X86::COND_E)) 8312 Res = DAG.getNOT(DL, Res, Res.getValueType()); 8313 8314 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 8315 if (N2C == 0 || !N2C->isNullValue()) 8316 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); 8317 return Res; 8318 } 8319 } 8320 8321 // Look past (and (setcc_carry (cmp ...)), 1). 
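  // The AND with 1 only normalizes the carry value to 0/1 and cannot change
  // whether it is zero, so the SETCC_CARRY node itself can drive the select.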
8322 if (Cond.getOpcode() == ISD::AND && 8323 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 8324 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 8325 if (C && C->getAPIntValue() == 1) 8326 Cond = Cond.getOperand(0); 8327 } 8328 8329 // If the condition flag is set by an X86ISD::CMP, then use it as the condition 8330 // setting operand in place of the X86ISD::SETCC. 8331 if (Cond.getOpcode() == X86ISD::SETCC || 8332 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 8333 CC = Cond.getOperand(0); 8334 8335 SDValue Cmp = Cond.getOperand(1); 8336 unsigned Opc = Cmp.getOpcode(); 8337 EVT VT = Op.getValueType(); 8338 8339 bool IllegalFPCMov = false; 8340 if (VT.isFloatingPoint() && !VT.isVector() && 8341 !isScalarFPTypeInSSEReg(VT)) // FPStack? 8342 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 8343 8344 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 8345 Opc == X86ISD::BT) { // FIXME 8346 Cond = Cmp; 8347 addTest = false; 8348 } 8349 } 8350 8351 if (addTest) { 8352 // Look past the truncate. 8353 if (Cond.getOpcode() == ISD::TRUNCATE) 8354 Cond = Cond.getOperand(0); 8355 8356 // We know the result of AND is compared against zero. Try to match 8357 // it to BT. 8358 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 8359 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); 8360 if (NewSetCC.getNode()) { 8361 CC = NewSetCC.getOperand(0); 8362 Cond = NewSetCC.getOperand(1); 8363 addTest = false; 8364 } 8365 } 8366 } 8367 8368 if (addTest) { 8369 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 8370 Cond = EmitTest(Cond, X86::COND_NE, DAG); 8371 } 8372 8373 // a < b ? -1 : 0 -> RES = ~setcc_carry 8374 // a < b ? 0 : -1 -> RES = setcc_carry 8375 // a >= b ? -1 : 0 -> RES = setcc_carry 8376 // a >= b ? 0 : -1 -> RES = ~setcc_carry 8377 if (Cond.getOpcode() == X86ISD::CMP) { 8378 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); 8379 8380 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && 8381 (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { 8382 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 8383 DAG.getConstant(X86::COND_B, MVT::i8), Cond); 8384 if (isAllOnes(Op1) != (CondCode == X86::COND_B)) 8385 return DAG.getNOT(DL, Res, Res.getValueType()); 8386 return Res; 8387 } 8388 } 8389 8390 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 8391 // condition is true. 8392 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); 8393 SDValue Ops[] = { Op2, Op1, CC, Cond }; 8394 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); 8395} 8396 8397// isAndOrOfSetCCs - Return true if node is an ISD::AND or 8398// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 8399// from the AND / OR. 8400static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 8401 Opc = Op.getOpcode(); 8402 if (Opc != ISD::OR && Opc != ISD::AND) 8403 return false; 8404 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 8405 Op.getOperand(0).hasOneUse() && 8406 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 8407 Op.getOperand(1).hasOneUse()); 8408} 8409 8410// isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and 8411// 1, and that the SETCC node has a single use.
8412static bool isXor1OfSetCC(SDValue Op) { 8413 if (Op.getOpcode() != ISD::XOR) 8414 return false; 8415 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 8416 if (N1C && N1C->getAPIntValue() == 1) { 8417 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 8418 Op.getOperand(0).hasOneUse(); 8419 } 8420 return false; 8421} 8422 8423SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 8424 bool addTest = true; 8425 SDValue Chain = Op.getOperand(0); 8426 SDValue Cond = Op.getOperand(1); 8427 SDValue Dest = Op.getOperand(2); 8428 DebugLoc dl = Op.getDebugLoc(); 8429 SDValue CC; 8430 8431 if (Cond.getOpcode() == ISD::SETCC) { 8432 SDValue NewCond = LowerSETCC(Cond, DAG); 8433 if (NewCond.getNode()) 8434 Cond = NewCond; 8435 } 8436#if 0 8437 // FIXME: LowerXALUO doesn't handle these!! 8438 else if (Cond.getOpcode() == X86ISD::ADD || 8439 Cond.getOpcode() == X86ISD::SUB || 8440 Cond.getOpcode() == X86ISD::SMUL || 8441 Cond.getOpcode() == X86ISD::UMUL) 8442 Cond = LowerXALUO(Cond, DAG); 8443#endif 8444 8445 // Look past (and (setcc_carry (cmp ...)), 1). 8446 if (Cond.getOpcode() == ISD::AND && 8447 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 8448 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 8449 if (C && C->getAPIntValue() == 1) 8450 Cond = Cond.getOperand(0); 8451 } 8452 8453 // If the condition flag is set by an X86ISD::CMP, then use it as the condition 8454 // setting operand in place of the X86ISD::SETCC. 8455 if (Cond.getOpcode() == X86ISD::SETCC || 8456 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 8457 CC = Cond.getOperand(0); 8458 8459 SDValue Cmp = Cond.getOperand(1); 8460 unsigned Opc = Cmp.getOpcode(); 8461 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 8462 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 8463 Cond = Cmp; 8464 addTest = false; 8465 } else { 8466 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 8467 default: break; 8468 case X86::COND_O: 8469 case X86::COND_B: 8470 // These can only come from an arithmetic instruction with overflow, 8471 // e.g. SADDO, UADDO. 8472 Cond = Cond.getNode()->getOperand(1); 8473 addTest = false; 8474 break; 8475 } 8476 } 8477 } else { 8478 unsigned CondOpc; 8479 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 8480 SDValue Cmp = Cond.getOperand(0).getOperand(1); 8481 if (CondOpc == ISD::OR) { 8482 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 8483 // two branches instead of an explicit OR instruction with a 8484 // separate test. 8485 if (Cmp == Cond.getOperand(1).getOperand(1) && 8486 isX86LogicalCmp(Cmp)) { 8487 CC = Cond.getOperand(0).getOperand(0); 8488 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8489 Chain, Dest, CC, Cmp); 8490 CC = Cond.getOperand(1).getOperand(0); 8491 Cond = Cmp; 8492 addTest = false; 8493 } 8494 } else { // ISD::AND 8495 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 8496 // two branches instead of an explicit AND instruction with a 8497 // separate test. However, we only do this if this block doesn't 8498 // have a fall-through edge, because this requires an explicit 8499 // jmp when the condition is false.
8500 if (Cmp == Cond.getOperand(1).getOperand(1) && 8501 isX86LogicalCmp(Cmp) && 8502 Op.getNode()->hasOneUse()) { 8503 X86::CondCode CCode = 8504 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 8505 CCode = X86::GetOppositeBranchCondition(CCode); 8506 CC = DAG.getConstant(CCode, MVT::i8); 8507 SDNode *User = *Op.getNode()->use_begin(); 8508 // Look for an unconditional branch following this conditional branch. 8509 // We need this because we need to reverse the successors in order 8510 // to implement FCMP_OEQ. 8511 if (User->getOpcode() == ISD::BR) { 8512 SDValue FalseBB = User->getOperand(1); 8513 SDNode *NewBR = 8514 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 8515 assert(NewBR == User); 8516 (void)NewBR; 8517 Dest = FalseBB; 8518 8519 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8520 Chain, Dest, CC, Cmp); 8521 X86::CondCode CCode = 8522 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 8523 CCode = X86::GetOppositeBranchCondition(CCode); 8524 CC = DAG.getConstant(CCode, MVT::i8); 8525 Cond = Cmp; 8526 addTest = false; 8527 } 8528 } 8529 } 8530 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 8531 // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition. 8532 // It should be transformed by the dag combiner except when the condition 8533 // is set by an arithmetic-with-overflow node. 8534 X86::CondCode CCode = 8535 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 8536 CCode = X86::GetOppositeBranchCondition(CCode); 8537 CC = DAG.getConstant(CCode, MVT::i8); 8538 Cond = Cond.getOperand(0).getOperand(1); 8539 addTest = false; 8540 } 8541 } 8542 8543 if (addTest) { 8544 // Look past the truncate. 8545 if (Cond.getOpcode() == ISD::TRUNCATE) 8546 Cond = Cond.getOperand(0); 8547 8548 // We know the result of AND is compared against zero. Try to match 8549 // it to BT. 8550 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 8551 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 8552 if (NewSetCC.getNode()) { 8553 CC = NewSetCC.getOperand(0); 8554 Cond = NewSetCC.getOperand(1); 8555 addTest = false; 8556 } 8557 } 8558 } 8559 8560 if (addTest) { 8561 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 8562 Cond = EmitTest(Cond, X86::COND_NE, DAG); 8563 } 8564 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8565 Chain, Dest, CC, Cond); 8566} 8567 8568 8569// Lower dynamic stack allocation to an _alloca call for Cygwin/Mingw targets. 8570// Calls to _alloca are needed to probe the stack when allocating more than 4k 8571// bytes in one go. Touching the stack at 4K increments is necessary to ensure 8572// that the guard pages used by the OS virtual memory manager are allocated in 8573// the correct sequence. 8574SDValue 8575X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 8576 SelectionDAG &DAG) const { 8577 assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) && 8578 "This should be used only on Windows targets"); 8579 assert(!Subtarget->isTargetEnvMacho()); 8580 DebugLoc dl = Op.getDebugLoc(); 8581 8582 // Get the inputs. 8583 SDValue Chain = Op.getOperand(0); 8584 SDValue Size = Op.getOperand(1); 8585 // FIXME: Ensure alignment here 8586 8587 SDValue Flag; 8588 8589 EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; 8590 unsigned Reg = (Subtarget->is64Bit() ?
X86::RAX : X86::EAX); 8591 8592 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); 8593 Flag = Chain.getValue(1); 8594 8595 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 8596 8597 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 8598 Flag = Chain.getValue(1); 8599 8600 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 8601 8602 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 8603 return DAG.getMergeValues(Ops1, 2, dl); 8604} 8605 8606SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 8607 MachineFunction &MF = DAG.getMachineFunction(); 8608 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 8609 8610 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 8611 DebugLoc DL = Op.getDebugLoc(); 8612 8613 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 8614 // vastart just stores the address of the VarArgsFrameIndex slot into the 8615 // memory location argument. 8616 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 8617 getPointerTy()); 8618 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 8619 MachinePointerInfo(SV), false, false, 0); 8620 } 8621 8622 // __va_list_tag: 8623 // gp_offset (0 - 6 * 8) 8624 // fp_offset (48 - 48 + 8 * 16) 8625 // overflow_arg_area (point to parameters coming in memory). 8626 // reg_save_area 8627 SmallVector<SDValue, 8> MemOps; 8628 SDValue FIN = Op.getOperand(1); 8629 // Store gp_offset 8630 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 8631 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 8632 MVT::i32), 8633 FIN, MachinePointerInfo(SV), false, false, 0); 8634 MemOps.push_back(Store); 8635 8636 // Store fp_offset 8637 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8638 FIN, DAG.getIntPtrConstant(4)); 8639 Store = DAG.getStore(Op.getOperand(0), DL, 8640 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 8641 MVT::i32), 8642 FIN, MachinePointerInfo(SV, 4), false, false, 0); 8643 MemOps.push_back(Store); 8644 8645 // Store ptr to overflow_arg_area 8646 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8647 FIN, DAG.getIntPtrConstant(4)); 8648 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 8649 getPointerTy()); 8650 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 8651 MachinePointerInfo(SV, 8), 8652 false, false, 0); 8653 MemOps.push_back(Store); 8654 8655 // Store ptr to reg_save_area. 
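  // reg_save_area lives at byte offset 16 of the va_list, after the two i32
  // offsets at 0 and 4 and the overflow_arg_area pointer at 8.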
8656 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8657 FIN, DAG.getIntPtrConstant(8)); 8658 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 8659 getPointerTy()); 8660 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 8661 MachinePointerInfo(SV, 16), false, false, 0); 8662 MemOps.push_back(Store); 8663 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 8664 &MemOps[0], MemOps.size()); 8665} 8666 8667SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 8668 assert(Subtarget->is64Bit() && 8669 "LowerVAARG only handles 64-bit va_arg!"); 8670 assert((Subtarget->isTargetLinux() || 8671 Subtarget->isTargetDarwin()) && 8672 "Unhandled target in LowerVAARG"); 8673 assert(Op.getNode()->getNumOperands() == 4); 8674 SDValue Chain = Op.getOperand(0); 8675 SDValue SrcPtr = Op.getOperand(1); 8676 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 8677 unsigned Align = Op.getConstantOperandVal(3); 8678 DebugLoc dl = Op.getDebugLoc(); 8679 8680 EVT ArgVT = Op.getNode()->getValueType(0); 8681 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 8682 uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); 8683 uint8_t ArgMode; 8684 8685 // Decide which area this value should be read from. 8686 // TODO: Implement the AMD64 ABI in its entirety. This simple 8687 // selection mechanism works only for the basic types. 8688 if (ArgVT == MVT::f80) { 8689 llvm_unreachable("va_arg for f80 not yet implemented"); 8690 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 8691 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 8692 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 8693 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 8694 } else { 8695 llvm_unreachable("Unhandled argument type in LowerVAARG"); 8696 } 8697 8698 if (ArgMode == 2) { 8699 // Sanity Check: Make sure using fp_offset makes sense. 8700 assert(!UseSoftFloat && 8701 !(DAG.getMachineFunction() 8702 .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && 8703 Subtarget->hasXMM()); 8704 } 8705 8706 // Insert VAARG_64 node into the DAG 8707 // VAARG_64 returns two values: Variable Argument Address, Chain 8708 SmallVector<SDValue, 11> InstOps; 8709 InstOps.push_back(Chain); 8710 InstOps.push_back(SrcPtr); 8711 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 8712 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 8713 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 8714 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 8715 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 8716 VTs, &InstOps[0], InstOps.size(), 8717 MVT::i64, 8718 MachinePointerInfo(SV), 8719 /*Align=*/0, 8720 /*Volatile=*/false, 8721 /*ReadMem=*/true, 8722 /*WriteMem=*/true); 8723 Chain = VAARG.getValue(1); 8724 8725 // Load the next argument and return it 8726 return DAG.getLoad(ArgVT, dl, 8727 Chain, 8728 VAARG, 8729 MachinePointerInfo(), 8730 false, false, 0); 8731} 8732 8733SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 8734 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
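  // That is 4 + 4 + 8 + 8 = 24 bytes, so va_copy is lowered to a plain
  // 24-byte, 8-byte-aligned memcpy.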
8735 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 8736 SDValue Chain = Op.getOperand(0); 8737 SDValue DstPtr = Op.getOperand(1); 8738 SDValue SrcPtr = Op.getOperand(2); 8739 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 8740 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 8741 DebugLoc DL = Op.getDebugLoc(); 8742 8743 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 8744 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 8745 false, 8746 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 8747} 8748 8749SDValue 8750X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 8751 DebugLoc dl = Op.getDebugLoc(); 8752 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 8753 switch (IntNo) { 8754 default: return SDValue(); // Don't custom lower most intrinsics. 8755 // Comparison intrinsics. 8756 case Intrinsic::x86_sse_comieq_ss: 8757 case Intrinsic::x86_sse_comilt_ss: 8758 case Intrinsic::x86_sse_comile_ss: 8759 case Intrinsic::x86_sse_comigt_ss: 8760 case Intrinsic::x86_sse_comige_ss: 8761 case Intrinsic::x86_sse_comineq_ss: 8762 case Intrinsic::x86_sse_ucomieq_ss: 8763 case Intrinsic::x86_sse_ucomilt_ss: 8764 case Intrinsic::x86_sse_ucomile_ss: 8765 case Intrinsic::x86_sse_ucomigt_ss: 8766 case Intrinsic::x86_sse_ucomige_ss: 8767 case Intrinsic::x86_sse_ucomineq_ss: 8768 case Intrinsic::x86_sse2_comieq_sd: 8769 case Intrinsic::x86_sse2_comilt_sd: 8770 case Intrinsic::x86_sse2_comile_sd: 8771 case Intrinsic::x86_sse2_comigt_sd: 8772 case Intrinsic::x86_sse2_comige_sd: 8773 case Intrinsic::x86_sse2_comineq_sd: 8774 case Intrinsic::x86_sse2_ucomieq_sd: 8775 case Intrinsic::x86_sse2_ucomilt_sd: 8776 case Intrinsic::x86_sse2_ucomile_sd: 8777 case Intrinsic::x86_sse2_ucomigt_sd: 8778 case Intrinsic::x86_sse2_ucomige_sd: 8779 case Intrinsic::x86_sse2_ucomineq_sd: { 8780 unsigned Opc = 0; 8781 ISD::CondCode CC = ISD::SETCC_INVALID; 8782 switch (IntNo) { 8783 default: break; 8784 case Intrinsic::x86_sse_comieq_ss: 8785 case Intrinsic::x86_sse2_comieq_sd: 8786 Opc = X86ISD::COMI; 8787 CC = ISD::SETEQ; 8788 break; 8789 case Intrinsic::x86_sse_comilt_ss: 8790 case Intrinsic::x86_sse2_comilt_sd: 8791 Opc = X86ISD::COMI; 8792 CC = ISD::SETLT; 8793 break; 8794 case Intrinsic::x86_sse_comile_ss: 8795 case Intrinsic::x86_sse2_comile_sd: 8796 Opc = X86ISD::COMI; 8797 CC = ISD::SETLE; 8798 break; 8799 case Intrinsic::x86_sse_comigt_ss: 8800 case Intrinsic::x86_sse2_comigt_sd: 8801 Opc = X86ISD::COMI; 8802 CC = ISD::SETGT; 8803 break; 8804 case Intrinsic::x86_sse_comige_ss: 8805 case Intrinsic::x86_sse2_comige_sd: 8806 Opc = X86ISD::COMI; 8807 CC = ISD::SETGE; 8808 break; 8809 case Intrinsic::x86_sse_comineq_ss: 8810 case Intrinsic::x86_sse2_comineq_sd: 8811 Opc = X86ISD::COMI; 8812 CC = ISD::SETNE; 8813 break; 8814 case Intrinsic::x86_sse_ucomieq_ss: 8815 case Intrinsic::x86_sse2_ucomieq_sd: 8816 Opc = X86ISD::UCOMI; 8817 CC = ISD::SETEQ; 8818 break; 8819 case Intrinsic::x86_sse_ucomilt_ss: 8820 case Intrinsic::x86_sse2_ucomilt_sd: 8821 Opc = X86ISD::UCOMI; 8822 CC = ISD::SETLT; 8823 break; 8824 case Intrinsic::x86_sse_ucomile_ss: 8825 case Intrinsic::x86_sse2_ucomile_sd: 8826 Opc = X86ISD::UCOMI; 8827 CC = ISD::SETLE; 8828 break; 8829 case Intrinsic::x86_sse_ucomigt_ss: 8830 case Intrinsic::x86_sse2_ucomigt_sd: 8831 Opc = X86ISD::UCOMI; 8832 CC = ISD::SETGT; 8833 break; 8834 case Intrinsic::x86_sse_ucomige_ss: 8835 case Intrinsic::x86_sse2_ucomige_sd: 8836 Opc = X86ISD::UCOMI; 8837 
CC = ISD::SETGE; 8838 break; 8839 case Intrinsic::x86_sse_ucomineq_ss: 8840 case Intrinsic::x86_sse2_ucomineq_sd: 8841 Opc = X86ISD::UCOMI; 8842 CC = ISD::SETNE; 8843 break; 8844 } 8845 8846 SDValue LHS = Op.getOperand(1); 8847 SDValue RHS = Op.getOperand(2); 8848 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 8849 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 8850 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 8851 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8852 DAG.getConstant(X86CC, MVT::i8), Cond); 8853 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 8854 } 8855 // ptest and testp intrinsics. The intrinsic these come from are designed to 8856 // return an integer value, not just an instruction so lower it to the ptest 8857 // or testp pattern and a setcc for the result. 8858 case Intrinsic::x86_sse41_ptestz: 8859 case Intrinsic::x86_sse41_ptestc: 8860 case Intrinsic::x86_sse41_ptestnzc: 8861 case Intrinsic::x86_avx_ptestz_256: 8862 case Intrinsic::x86_avx_ptestc_256: 8863 case Intrinsic::x86_avx_ptestnzc_256: 8864 case Intrinsic::x86_avx_vtestz_ps: 8865 case Intrinsic::x86_avx_vtestc_ps: 8866 case Intrinsic::x86_avx_vtestnzc_ps: 8867 case Intrinsic::x86_avx_vtestz_pd: 8868 case Intrinsic::x86_avx_vtestc_pd: 8869 case Intrinsic::x86_avx_vtestnzc_pd: 8870 case Intrinsic::x86_avx_vtestz_ps_256: 8871 case Intrinsic::x86_avx_vtestc_ps_256: 8872 case Intrinsic::x86_avx_vtestnzc_ps_256: 8873 case Intrinsic::x86_avx_vtestz_pd_256: 8874 case Intrinsic::x86_avx_vtestc_pd_256: 8875 case Intrinsic::x86_avx_vtestnzc_pd_256: { 8876 bool IsTestPacked = false; 8877 unsigned X86CC = 0; 8878 switch (IntNo) { 8879 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 8880 case Intrinsic::x86_avx_vtestz_ps: 8881 case Intrinsic::x86_avx_vtestz_pd: 8882 case Intrinsic::x86_avx_vtestz_ps_256: 8883 case Intrinsic::x86_avx_vtestz_pd_256: 8884 IsTestPacked = true; // Fallthrough 8885 case Intrinsic::x86_sse41_ptestz: 8886 case Intrinsic::x86_avx_ptestz_256: 8887 // ZF = 1 8888 X86CC = X86::COND_E; 8889 break; 8890 case Intrinsic::x86_avx_vtestc_ps: 8891 case Intrinsic::x86_avx_vtestc_pd: 8892 case Intrinsic::x86_avx_vtestc_ps_256: 8893 case Intrinsic::x86_avx_vtestc_pd_256: 8894 IsTestPacked = true; // Fallthrough 8895 case Intrinsic::x86_sse41_ptestc: 8896 case Intrinsic::x86_avx_ptestc_256: 8897 // CF = 1 8898 X86CC = X86::COND_B; 8899 break; 8900 case Intrinsic::x86_avx_vtestnzc_ps: 8901 case Intrinsic::x86_avx_vtestnzc_pd: 8902 case Intrinsic::x86_avx_vtestnzc_ps_256: 8903 case Intrinsic::x86_avx_vtestnzc_pd_256: 8904 IsTestPacked = true; // Fallthrough 8905 case Intrinsic::x86_sse41_ptestnzc: 8906 case Intrinsic::x86_avx_ptestnzc_256: 8907 // ZF and CF = 0 8908 X86CC = X86::COND_A; 8909 break; 8910 } 8911 8912 SDValue LHS = Op.getOperand(1); 8913 SDValue RHS = Op.getOperand(2); 8914 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 8915 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 8916 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 8917 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 8918 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 8919 } 8920 8921 // Fix vector shift instructions where the last operand is a non-immediate 8922 // i32 value. 
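  // e.g. a non-constant amount passed to x86_sse2_pslli_w is rewritten below
  // to use x86_sse2_psll_w, with the scalar i32 amount placed in the low
  // element of a vector shift-amount operand.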
8923 case Intrinsic::x86_sse2_pslli_w: 8924 case Intrinsic::x86_sse2_pslli_d: 8925 case Intrinsic::x86_sse2_pslli_q: 8926 case Intrinsic::x86_sse2_psrli_w: 8927 case Intrinsic::x86_sse2_psrli_d: 8928 case Intrinsic::x86_sse2_psrli_q: 8929 case Intrinsic::x86_sse2_psrai_w: 8930 case Intrinsic::x86_sse2_psrai_d: 8931 case Intrinsic::x86_mmx_pslli_w: 8932 case Intrinsic::x86_mmx_pslli_d: 8933 case Intrinsic::x86_mmx_pslli_q: 8934 case Intrinsic::x86_mmx_psrli_w: 8935 case Intrinsic::x86_mmx_psrli_d: 8936 case Intrinsic::x86_mmx_psrli_q: 8937 case Intrinsic::x86_mmx_psrai_w: 8938 case Intrinsic::x86_mmx_psrai_d: { 8939 SDValue ShAmt = Op.getOperand(2); 8940 if (isa<ConstantSDNode>(ShAmt)) 8941 return SDValue(); 8942 8943 unsigned NewIntNo = 0; 8944 EVT ShAmtVT = MVT::v4i32; 8945 switch (IntNo) { 8946 case Intrinsic::x86_sse2_pslli_w: 8947 NewIntNo = Intrinsic::x86_sse2_psll_w; 8948 break; 8949 case Intrinsic::x86_sse2_pslli_d: 8950 NewIntNo = Intrinsic::x86_sse2_psll_d; 8951 break; 8952 case Intrinsic::x86_sse2_pslli_q: 8953 NewIntNo = Intrinsic::x86_sse2_psll_q; 8954 break; 8955 case Intrinsic::x86_sse2_psrli_w: 8956 NewIntNo = Intrinsic::x86_sse2_psrl_w; 8957 break; 8958 case Intrinsic::x86_sse2_psrli_d: 8959 NewIntNo = Intrinsic::x86_sse2_psrl_d; 8960 break; 8961 case Intrinsic::x86_sse2_psrli_q: 8962 NewIntNo = Intrinsic::x86_sse2_psrl_q; 8963 break; 8964 case Intrinsic::x86_sse2_psrai_w: 8965 NewIntNo = Intrinsic::x86_sse2_psra_w; 8966 break; 8967 case Intrinsic::x86_sse2_psrai_d: 8968 NewIntNo = Intrinsic::x86_sse2_psra_d; 8969 break; 8970 default: { 8971 ShAmtVT = MVT::v2i32; 8972 switch (IntNo) { 8973 case Intrinsic::x86_mmx_pslli_w: 8974 NewIntNo = Intrinsic::x86_mmx_psll_w; 8975 break; 8976 case Intrinsic::x86_mmx_pslli_d: 8977 NewIntNo = Intrinsic::x86_mmx_psll_d; 8978 break; 8979 case Intrinsic::x86_mmx_pslli_q: 8980 NewIntNo = Intrinsic::x86_mmx_psll_q; 8981 break; 8982 case Intrinsic::x86_mmx_psrli_w: 8983 NewIntNo = Intrinsic::x86_mmx_psrl_w; 8984 break; 8985 case Intrinsic::x86_mmx_psrli_d: 8986 NewIntNo = Intrinsic::x86_mmx_psrl_d; 8987 break; 8988 case Intrinsic::x86_mmx_psrli_q: 8989 NewIntNo = Intrinsic::x86_mmx_psrl_q; 8990 break; 8991 case Intrinsic::x86_mmx_psrai_w: 8992 NewIntNo = Intrinsic::x86_mmx_psra_w; 8993 break; 8994 case Intrinsic::x86_mmx_psrai_d: 8995 NewIntNo = Intrinsic::x86_mmx_psra_d; 8996 break; 8997 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 8998 } 8999 break; 9000 } 9001 } 9002 9003 // The vector shift intrinsics with scalars use 32-bit shift amounts but 9004 // the sse2/mmx shift instructions read 64 bits. Set the upper 32 bits 9005 // to be zero. 9006 SDValue ShOps[4]; 9007 ShOps[0] = ShAmt; 9008 ShOps[1] = DAG.getConstant(0, MVT::i32); 9009 if (ShAmtVT == MVT::v4i32) { 9010 ShOps[2] = DAG.getUNDEF(MVT::i32); 9011 ShOps[3] = DAG.getUNDEF(MVT::i32); 9012 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 9013 } else { 9014 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 9015// FIXME this must be lowered to get rid of the invalid type.
9016 } 9017 9018 EVT VT = Op.getValueType(); 9019 ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); 9020 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9021 DAG.getConstant(NewIntNo, MVT::i32), 9022 Op.getOperand(1), ShAmt); 9023 } 9024 } 9025} 9026 9027SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 9028 SelectionDAG &DAG) const { 9029 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 9030 MFI->setReturnAddressIsTaken(true); 9031 9032 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 9033 DebugLoc dl = Op.getDebugLoc(); 9034 9035 if (Depth > 0) { 9036 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 9037 SDValue Offset = 9038 DAG.getConstant(TD->getPointerSize(), 9039 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 9040 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 9041 DAG.getNode(ISD::ADD, dl, getPointerTy(), 9042 FrameAddr, Offset), 9043 MachinePointerInfo(), false, false, 0); 9044 } 9045 9046 // Just load the return address. 9047 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 9048 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 9049 RetAddrFI, MachinePointerInfo(), false, false, 0); 9050} 9051 9052SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 9053 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 9054 MFI->setFrameAddressIsTaken(true); 9055 9056 EVT VT = Op.getValueType(); 9057 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 9058 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 9059 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 9060 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 9061 while (Depth--) 9062 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 9063 MachinePointerInfo(), 9064 false, false, 0); 9065 return FrameAddr; 9066} 9067 9068SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 9069 SelectionDAG &DAG) const { 9070 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 9071} 9072 9073SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 9074 MachineFunction &MF = DAG.getMachineFunction(); 9075 SDValue Chain = Op.getOperand(0); 9076 SDValue Offset = Op.getOperand(1); 9077 SDValue Handler = Op.getOperand(2); 9078 DebugLoc dl = Op.getDebugLoc(); 9079 9080 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 9081 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 9082 getPointerTy()); 9083 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 9084 9085 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 9086 DAG.getIntPtrConstant(TD->getPointerSize())); 9087 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 9088 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 9089 false, false, 0); 9090 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 9091 MF.getRegInfo().addLiveOut(StoreAddrReg); 9092 9093 return DAG.getNode(X86ISD::EH_RETURN, dl, 9094 MVT::Other, 9095 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 9096} 9097 9098SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 9099 SelectionDAG &DAG) const { 9100 SDValue Root = Op.getOperand(0); 9101 SDValue Trmp = Op.getOperand(1); // trampoline 9102 SDValue FPtr = Op.getOperand(2); // nested function 9103 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 9104 DebugLoc dl = Op.getDebugLoc(); 9105 9106 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 9107 9108 if (Subtarget->is64Bit()) { 9109 SDValue OutChains[6]; 9110 9111 // Large code-model. 9112 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 9113 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 9114 9115 const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10); 9116 const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11); 9117 9118 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 9119 9120 // Load the pointer to the nested function into R11. 9121 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 9122 SDValue Addr = Trmp; 9123 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 9124 Addr, MachinePointerInfo(TrmpAddr), 9125 false, false, 0); 9126 9127 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9128 DAG.getConstant(2, MVT::i64)); 9129 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 9130 MachinePointerInfo(TrmpAddr, 2), 9131 false, false, 2); 9132 9133 // Load the 'nest' parameter value into R10. 9134 // R10 is specified in X86CallingConv.td 9135 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 9136 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9137 DAG.getConstant(10, MVT::i64)); 9138 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 9139 Addr, MachinePointerInfo(TrmpAddr, 10), 9140 false, false, 0); 9141 9142 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9143 DAG.getConstant(12, MVT::i64)); 9144 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 9145 MachinePointerInfo(TrmpAddr, 12), 9146 false, false, 2); 9147 9148 // Jump to the nested function. 9149 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
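    // The stores below place these two opcode bytes at offsets 20-21 and the
    // ModRM byte (*%r11) at offset 22, completing the trampoline sequence
    // movabsq $FPtr, %r11; movabsq $Nest, %r10; jmpq *%r11.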
9150 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9151 DAG.getConstant(20, MVT::i64)); 9152 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 9153 Addr, MachinePointerInfo(TrmpAddr, 20), 9154 false, false, 0); 9155 9156 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 9157 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9158 DAG.getConstant(22, MVT::i64)); 9159 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 9160 MachinePointerInfo(TrmpAddr, 22), 9161 false, false, 0); 9162 9163 SDValue Ops[] = 9164 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 9165 return DAG.getMergeValues(Ops, 2, dl); 9166 } else { 9167 const Function *Func = 9168 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 9169 CallingConv::ID CC = Func->getCallingConv(); 9170 unsigned NestReg; 9171 9172 switch (CC) { 9173 default: 9174 llvm_unreachable("Unsupported calling convention"); 9175 case CallingConv::C: 9176 case CallingConv::X86_StdCall: { 9177 // Pass 'nest' parameter in ECX. 9178 // Must be kept in sync with X86CallingConv.td 9179 NestReg = X86::ECX; 9180 9181 // Check that ECX wasn't needed by an 'inreg' parameter. 9182 FunctionType *FTy = Func->getFunctionType(); 9183 const AttrListPtr &Attrs = Func->getAttributes(); 9184 9185 if (!Attrs.isEmpty() && !Func->isVarArg()) { 9186 unsigned InRegCount = 0; 9187 unsigned Idx = 1; 9188 9189 for (FunctionType::param_iterator I = FTy->param_begin(), 9190 E = FTy->param_end(); I != E; ++I, ++Idx) 9191 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 9192 // FIXME: should only count parameters that are lowered to integers. 9193 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 9194 9195 if (InRegCount > 2) { 9196 report_fatal_error("Nest register in use - reduce number of inreg" 9197 " parameters!"); 9198 } 9199 } 9200 break; 9201 } 9202 case CallingConv::X86_FastCall: 9203 case CallingConv::X86_ThisCall: 9204 case CallingConv::Fast: 9205 // Pass 'nest' parameter in EAX. 9206 // Must be kept in sync with X86CallingConv.td 9207 NestReg = X86::EAX; 9208 break; 9209 } 9210 9211 SDValue OutChains[4]; 9212 SDValue Addr, Disp; 9213 9214 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9215 DAG.getConstant(10, MVT::i32)); 9216 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 9217 9218 // This is storing the opcode for MOV32ri. 9219 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 9220 const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg); 9221 OutChains[0] = DAG.getStore(Root, dl, 9222 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 9223 Trmp, MachinePointerInfo(TrmpAddr), 9224 false, false, 0); 9225 9226 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9227 DAG.getConstant(1, MVT::i32)); 9228 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 9229 MachinePointerInfo(TrmpAddr, 1), 9230 false, false, 1); 9231 9232 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
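    // 32-bit trampoline layout: [0] B8+reg (movl into the nest register),
    // [1-4] the nest value, [5] E9, [6-9] the rel32 displacement to the nested
    // function, computed earlier relative to the end of the 10-byte trampoline.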
9233 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9234 DAG.getConstant(5, MVT::i32)); 9235 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 9236 MachinePointerInfo(TrmpAddr, 5), 9237 false, false, 1); 9238 9239 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9240 DAG.getConstant(6, MVT::i32)); 9241 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 9242 MachinePointerInfo(TrmpAddr, 6), 9243 false, false, 1); 9244 9245 SDValue Ops[] = 9246 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 9247 return DAG.getMergeValues(Ops, 2, dl); 9248 } 9249} 9250 9251SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 9252 SelectionDAG &DAG) const { 9253 /* 9254 The rounding mode is in bits 11:10 of FPSR, and has the following 9255 settings: 9256 00 Round to nearest 9257 01 Round to -inf 9258 10 Round to +inf 9259 11 Round to 0 9260 9261 FLT_ROUNDS, on the other hand, expects the following: 9262 -1 Undefined 9263 0 Round to 0 9264 1 Round to nearest 9265 2 Round to +inf 9266 3 Round to -inf 9267 9268 To perform the conversion, we do: 9269 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 9270 */ 9271 9272 MachineFunction &MF = DAG.getMachineFunction(); 9273 const TargetMachine &TM = MF.getTarget(); 9274 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 9275 unsigned StackAlignment = TFI.getStackAlignment(); 9276 EVT VT = Op.getValueType(); 9277 DebugLoc DL = Op.getDebugLoc(); 9278 9279 // Save FP Control Word to stack slot 9280 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 9281 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 9282 9283 9284 MachineMemOperand *MMO = 9285 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 9286 MachineMemOperand::MOStore, 2, 2); 9287 9288 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 9289 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 9290 DAG.getVTList(MVT::Other), 9291 Ops, 2, MVT::i16, MMO); 9292 9293 // Load FP Control Word from stack slot 9294 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 9295 MachinePointerInfo(), false, false, 0); 9296 9297 // Transform as necessary 9298 SDValue CWD1 = 9299 DAG.getNode(ISD::SRL, DL, MVT::i16, 9300 DAG.getNode(ISD::AND, DL, MVT::i16, 9301 CWD, DAG.getConstant(0x800, MVT::i16)), 9302 DAG.getConstant(11, MVT::i8)); 9303 SDValue CWD2 = 9304 DAG.getNode(ISD::SRL, DL, MVT::i16, 9305 DAG.getNode(ISD::AND, DL, MVT::i16, 9306 CWD, DAG.getConstant(0x400, MVT::i16)), 9307 DAG.getConstant(9, MVT::i8)); 9308 9309 SDValue RetVal = 9310 DAG.getNode(ISD::AND, DL, MVT::i16, 9311 DAG.getNode(ISD::ADD, DL, MVT::i16, 9312 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 9313 DAG.getConstant(1, MVT::i16)), 9314 DAG.getConstant(3, MVT::i16)); 9315 9316 9317 return DAG.getNode((VT.getSizeInBits() < 16 ? 9318 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 9319} 9320 9321SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 9322 EVT VT = Op.getValueType(); 9323 EVT OpVT = VT; 9324 unsigned NumBits = VT.getSizeInBits(); 9325 DebugLoc dl = Op.getDebugLoc(); 9326 9327 Op = Op.getOperand(0); 9328 if (VT == MVT::i8) { 9329 // Zero extend to i32 since there is not an i8 bsr. 9330 OpVT = MVT::i32; 9331 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 9332 } 9333 9334 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 9335 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 9336 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 9337 9338 // If src is zero (i.e. 
bsr sets ZF), returns NumBits. 9339 SDValue Ops[] = { 9340 Op, 9341 DAG.getConstant(NumBits+NumBits-1, OpVT), 9342 DAG.getConstant(X86::COND_E, MVT::i8), 9343 Op.getValue(1) 9344 }; 9345 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 9346 9347 // Finally xor with NumBits-1. 9348 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 9349 9350 if (VT == MVT::i8) 9351 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 9352 return Op; 9353} 9354 9355SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 9356 EVT VT = Op.getValueType(); 9357 EVT OpVT = VT; 9358 unsigned NumBits = VT.getSizeInBits(); 9359 DebugLoc dl = Op.getDebugLoc(); 9360 9361 Op = Op.getOperand(0); 9362 if (VT == MVT::i8) { 9363 OpVT = MVT::i32; 9364 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 9365 } 9366 9367 // Issue a bsf (scan bits forward) which also sets EFLAGS. 9368 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 9369 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 9370 9371 // If src is zero (i.e. bsf sets ZF), returns NumBits. 9372 SDValue Ops[] = { 9373 Op, 9374 DAG.getConstant(NumBits, OpVT), 9375 DAG.getConstant(X86::COND_E, MVT::i8), 9376 Op.getValue(1) 9377 }; 9378 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 9379 9380 if (VT == MVT::i8) 9381 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 9382 return Op; 9383} 9384 9385SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { 9386 EVT VT = Op.getValueType(); 9387 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 9388 DebugLoc dl = Op.getDebugLoc(); 9389 9390 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 9391 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 9392 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 9393 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 9394 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 9395 // 9396 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 9397 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 9398 // return AloBlo + AloBhi + AhiBlo; 9399 9400 SDValue A = Op.getOperand(0); 9401 SDValue B = Op.getOperand(1); 9402 9403 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9404 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9405 A, DAG.getConstant(32, MVT::i32)); 9406 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9407 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9408 B, DAG.getConstant(32, MVT::i32)); 9409 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9410 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 9411 A, B); 9412 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9413 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 9414 A, Bhi); 9415 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9416 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 9417 Ahi, B); 9418 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9419 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9420 AloBhi, DAG.getConstant(32, MVT::i32)); 9421 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9422 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9423 AhiBlo, DAG.getConstant(32, MVT::i32)); 9424 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 9425 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 9426 return Res; 9427} 9428 9429SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 9430 9431 EVT VT = Op.getValueType(); 9432 DebugLoc dl = 
Op.getDebugLoc(); 9433 SDValue R = Op.getOperand(0); 9434 SDValue Amt = Op.getOperand(1); 9435 LLVMContext *Context = DAG.getContext(); 9436 9437 if (!(Subtarget->hasSSE2() || Subtarget->hasAVX())) 9438 return SDValue(); 9439 9440 // Decompose 256-bit shifts into smaller 128-bit shifts. 9441 if (VT.getSizeInBits() == 256) { 9442 int NumElems = VT.getVectorNumElements(); 9443 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 9444 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 9445 9446 // Extract the two vectors 9447 SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl); 9448 SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32), 9449 DAG, dl); 9450 9451 // Recreate the shift amount vectors 9452 SmallVector<SDValue, 4> Amt1Csts; 9453 SmallVector<SDValue, 4> Amt2Csts; 9454 for (int i = 0; i < NumElems/2; ++i) 9455 Amt1Csts.push_back(Amt->getOperand(i)); 9456 for (int i = NumElems/2; i < NumElems; ++i) 9457 Amt2Csts.push_back(Amt->getOperand(i)); 9458 9459 SDValue Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, 9460 &Amt1Csts[0], NumElems/2); 9461 SDValue Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, 9462 &Amt2Csts[0], NumElems/2); 9463 9464 // Issue new vector shifts for the smaller types 9465 V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); 9466 V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); 9467 9468 // Concatenate the result back 9469 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); 9470 } 9471 9472 // Optimize shl/srl/sra with constant shift amount. 9473 if (isSplatVector(Amt.getNode())) { 9474 SDValue SclrAmt = Amt->getOperand(0); 9475 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { 9476 uint64_t ShiftAmt = C->getZExtValue(); 9477 9478 if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL) 9479 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9480 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9481 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9482 9483 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL) 9484 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9485 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9486 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9487 9488 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL) 9489 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9490 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9491 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9492 9493 if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL) 9494 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9495 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9496 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9497 9498 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL) 9499 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9500 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 9501 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9502 9503 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL) 9504 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9505 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 9506 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9507 9508 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA) 9509 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9510 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 9511 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9512 9513 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA) 9514 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9515 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 9516 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9517 
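      // Note: SSE2 has no 64-bit arithmetic right shift (there is no psraq
      // instruction), so v2i64 SRA is deliberately absent from the ladder
      // above, unlike the logical shifts.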
} 9518 } 9519 9520 // Lower SHL with variable shift amount. 9521 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { 9522 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9523 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9524 Op.getOperand(1), DAG.getConstant(23, MVT::i32)); 9525 9526 ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); 9527 9528 std::vector<Constant*> CV(4, CI); 9529 Constant *C = ConstantVector::get(CV); 9530 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 9531 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 9532 MachinePointerInfo::getConstantPool(), 9533 false, false, 16); 9534 9535 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 9536 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); 9537 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 9538 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 9539 } 9540 if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { 9541 // a = a << 5; 9542 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9543 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9544 Op.getOperand(1), DAG.getConstant(5, MVT::i32)); 9545 9546 ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); 9547 ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); 9548 9549 std::vector<Constant*> CVM1(16, CM1); 9550 std::vector<Constant*> CVM2(16, CM2); 9551 Constant *C = ConstantVector::get(CVM1); 9552 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 9553 SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 9554 MachinePointerInfo::getConstantPool(), 9555 false, false, 16); 9556 9557 // r = pblendv(r, psllw(r & (char16)15, 4), a); 9558 M = DAG.getNode(ISD::AND, dl, VT, R, M); 9559 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9560 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 9561 DAG.getConstant(4, MVT::i32)); 9562 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op); 9563 // a += a 9564 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 9565 9566 C = ConstantVector::get(CVM2); 9567 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 9568 M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 9569 MachinePointerInfo::getConstantPool(), 9570 false, false, 16); 9571 9572 // r = pblendv(r, psllw(r & (char16)63, 2), a); 9573 M = DAG.getNode(ISD::AND, dl, VT, R, M); 9574 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9575 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 9576 DAG.getConstant(2, MVT::i32)); 9577 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op); 9578 // a += a 9579 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 9580 9581 // return pblendv(r, r+r, a); 9582 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, 9583 R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); 9584 return R; 9585 } 9586 return SDValue(); 9587} 9588 9589SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 9590 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 9591 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 9592 // looks for this combo and may remove the "setcc" instruction if the "setcc" 9593 // has only one use. 9594 SDNode *N = Op.getNode(); 9595 SDValue LHS = N->getOperand(0); 9596 SDValue RHS = N->getOperand(1); 9597 unsigned BaseOp = 0; 9598 unsigned Cond = 0; 9599 DebugLoc DL = Op.getDebugLoc(); 9600 switch (Op.getOpcode()) { 9601 default: llvm_unreachable("Unknown ovf instruction!"); 9602 case ISD::SADDO: 9603 // A subtract of one will be selected as a INC. 
Note that INC doesn't 9604 // set CF, so we can't do this for UADDO. 9605 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 9606 if (C->isOne()) { 9607 BaseOp = X86ISD::INC; 9608 Cond = X86::COND_O; 9609 break; 9610 } 9611 BaseOp = X86ISD::ADD; 9612 Cond = X86::COND_O; 9613 break; 9614 case ISD::UADDO: 9615 BaseOp = X86ISD::ADD; 9616 Cond = X86::COND_B; 9617 break; 9618 case ISD::SSUBO: 9619 // A subtract of one will be selected as a DEC. Note that DEC doesn't 9620 // set CF, so we can't do this for USUBO. 9621 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 9622 if (C->isOne()) { 9623 BaseOp = X86ISD::DEC; 9624 Cond = X86::COND_O; 9625 break; 9626 } 9627 BaseOp = X86ISD::SUB; 9628 Cond = X86::COND_O; 9629 break; 9630 case ISD::USUBO: 9631 BaseOp = X86ISD::SUB; 9632 Cond = X86::COND_B; 9633 break; 9634 case ISD::SMULO: 9635 BaseOp = X86ISD::SMUL; 9636 Cond = X86::COND_O; 9637 break; 9638 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 9639 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 9640 MVT::i32); 9641 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 9642 9643 SDValue SetCC = 9644 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9645 DAG.getConstant(X86::COND_O, MVT::i32), 9646 SDValue(Sum.getNode(), 2)); 9647 9648 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 9649 } 9650 } 9651 9652 // Also sets EFLAGS. 9653 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 9654 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 9655 9656 SDValue SetCC = 9657 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), 9658 DAG.getConstant(Cond, MVT::i32), 9659 SDValue(Sum.getNode(), 1)); 9660 9661 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 9662} 9663 9664SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{ 9665 DebugLoc dl = Op.getDebugLoc(); 9666 SDNode* Node = Op.getNode(); 9667 EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT(); 9668 EVT VT = Node->getValueType(0); 9669 9670 if (Subtarget->hasSSE2() && VT.isVector()) { 9671 unsigned BitsDiff = VT.getScalarType().getSizeInBits() - 9672 ExtraVT.getScalarType().getSizeInBits(); 9673 SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); 9674 9675 unsigned SHLIntrinsicsID = 0; 9676 unsigned SRAIntrinsicsID = 0; 9677 switch (VT.getSimpleVT().SimpleTy) { 9678 default: 9679 return SDValue(); 9680 case MVT::v2i64: { 9681 SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_q; 9682 SRAIntrinsicsID = 0; 9683 break; 9684 } 9685 case MVT::v4i32: { 9686 SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d; 9687 SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d; 9688 break; 9689 } 9690 case MVT::v8i16: { 9691 SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_w; 9692 SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w; 9693 break; 9694 } 9695 } 9696 9697 SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9698 DAG.getConstant(SHLIntrinsicsID, MVT::i32), 9699 Node->getOperand(0), ShAmt); 9700 9701 // In case of 1 bit sext, no need to shr 9702 if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1; 9703 9704 if (SRAIntrinsicsID) { 9705 Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9706 DAG.getConstant(SRAIntrinsicsID, MVT::i32), 9707 Tmp1, ShAmt); 9708 } 9709 return Tmp1; 9710 } 9711 9712 return SDValue(); 9713} 9714 9715 9716SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ 9717 DebugLoc dl = Op.getDebugLoc(); 9718 9719 // Go ahead and emit the fence on x86-64 even if we asked for 
no-sse2. 9720 // There isn't any reason to disable it if the target processor supports it. 9721 if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) { 9722 SDValue Chain = Op.getOperand(0); 9723 SDValue Zero = DAG.getConstant(0, MVT::i32); 9724 SDValue Ops[] = { 9725 DAG.getRegister(X86::ESP, MVT::i32), // Base 9726 DAG.getTargetConstant(1, MVT::i8), // Scale 9727 DAG.getRegister(0, MVT::i32), // Index 9728 DAG.getTargetConstant(0, MVT::i32), // Disp 9729 DAG.getRegister(0, MVT::i32), // Segment. 9730 Zero, 9731 Chain 9732 }; 9733 SDNode *Res = 9734 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 9735 array_lengthof(Ops)); 9736 return SDValue(Res, 0); 9737 } 9738 9739 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 9740 if (!isDev) 9741 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 9742 9743 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 9744 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 9745 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 9746 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 9747 9748 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 9749 if (!Op1 && !Op2 && !Op3 && Op4) 9750 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 9751 9752 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 9753 if (Op1 && !Op2 && !Op3 && !Op4) 9754 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 9755 9756 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 9757 // (MFENCE)>; 9758 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 9759} 9760 9761SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op, 9762 SelectionDAG &DAG) const { 9763 DebugLoc dl = Op.getDebugLoc(); 9764 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( 9765 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); 9766 SynchronizationScope FenceScope = static_cast<SynchronizationScope>( 9767 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 9768 9769 // The only fence that needs an instruction is a sequentially-consistent 9770 // cross-thread fence. 9771 if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { 9772 // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for 9773 // no-sse2). There isn't any reason to disable it if the target processor 9774 // supports it. 9775 if (Subtarget->hasSSE2() || Subtarget->is64Bit()) 9776 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 9777 9778 SDValue Chain = Op.getOperand(0); 9779 SDValue Zero = DAG.getConstant(0, MVT::i32); 9780 SDValue Ops[] = { 9781 DAG.getRegister(X86::ESP, MVT::i32), // Base 9782 DAG.getTargetConstant(1, MVT::i8), // Scale 9783 DAG.getRegister(0, MVT::i32), // Index 9784 DAG.getTargetConstant(0, MVT::i32), // Disp 9785 DAG.getRegister(0, MVT::i32), // Segment. 9786 Zero, 9787 Chain 9788 }; 9789 SDNode *Res = 9790 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 9791 array_lengthof(Ops)); 9792 return SDValue(Res, 0); 9793 } 9794 9795 // MEMBARRIER is a compiler barrier; it codegens to a no-op. 
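  // On x86 the hardware memory model (TSO) already forbids every reordering
  // except store-load, so acquire/release orderings and single-thread fences
  // only need to constrain the compiler; the one case that requires a real
  // instruction is the seq_cst cross-thread fence handled above.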
9796 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 9797} 9798 9799 9800SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 9801 EVT T = Op.getValueType(); 9802 DebugLoc DL = Op.getDebugLoc(); 9803 unsigned Reg = 0; 9804 unsigned size = 0; 9805 switch(T.getSimpleVT().SimpleTy) { 9806 default: 9807 assert(false && "Invalid value type!"); 9808 case MVT::i8: Reg = X86::AL; size = 1; break; 9809 case MVT::i16: Reg = X86::AX; size = 2; break; 9810 case MVT::i32: Reg = X86::EAX; size = 4; break; 9811 case MVT::i64: 9812 assert(Subtarget->is64Bit() && "Node not type legal!"); 9813 Reg = X86::RAX; size = 8; 9814 break; 9815 } 9816 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 9817 Op.getOperand(2), SDValue()); 9818 SDValue Ops[] = { cpIn.getValue(0), 9819 Op.getOperand(1), 9820 Op.getOperand(3), 9821 DAG.getTargetConstant(size, MVT::i8), 9822 cpIn.getValue(1) }; 9823 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 9824 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 9825 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 9826 Ops, 5, T, MMO); 9827 SDValue cpOut = 9828 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 9829 return cpOut; 9830} 9831 9832SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 9833 SelectionDAG &DAG) const { 9834 assert(Subtarget->is64Bit() && "Result not type legalized?"); 9835 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 9836 SDValue TheChain = Op.getOperand(0); 9837 DebugLoc dl = Op.getDebugLoc(); 9838 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 9839 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 9840 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 9841 rax.getValue(2)); 9842 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 9843 DAG.getConstant(32, MVT::i8)); 9844 SDValue Ops[] = { 9845 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 9846 rdx.getValue(1) 9847 }; 9848 return DAG.getMergeValues(Ops, 2, dl); 9849} 9850 9851SDValue X86TargetLowering::LowerBITCAST(SDValue Op, 9852 SelectionDAG &DAG) const { 9853 EVT SrcVT = Op.getOperand(0).getValueType(); 9854 EVT DstVT = Op.getValueType(); 9855 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && 9856 Subtarget->hasMMX() && "Unexpected custom BITCAST"); 9857 assert((DstVT == MVT::i64 || 9858 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 9859 "Unexpected custom BITCAST"); 9860 // i64 <=> MMX conversions are Legal. 9861 if (SrcVT==MVT::i64 && DstVT.isVector()) 9862 return Op; 9863 if (DstVT==MVT::i64 && SrcVT.isVector()) 9864 return Op; 9865 // MMX <=> MMX conversions are Legal. 9866 if (SrcVT.isVector() && DstVT.isVector()) 9867 return Op; 9868 // All other conversions need to be expanded. 
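  // (Returning a null SDValue here lets the legalizer fall back to its
  // default expansion of the bitcast, which typically goes through a stack
  // temporary.)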
9869 return SDValue(); 9870} 9871 9872SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 9873 SDNode *Node = Op.getNode(); 9874 DebugLoc dl = Node->getDebugLoc(); 9875 EVT T = Node->getValueType(0); 9876 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 9877 DAG.getConstant(0, T), Node->getOperand(2)); 9878 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 9879 cast<AtomicSDNode>(Node)->getMemoryVT(), 9880 Node->getOperand(0), 9881 Node->getOperand(1), negOp, 9882 cast<AtomicSDNode>(Node)->getSrcValue(), 9883 cast<AtomicSDNode>(Node)->getAlignment(), 9884 cast<AtomicSDNode>(Node)->getOrdering(), 9885 cast<AtomicSDNode>(Node)->getSynchScope()); 9886} 9887 9888static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 9889 EVT VT = Op.getNode()->getValueType(0); 9890 9891 // Let legalize expand this if it isn't a legal type yet. 9892 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 9893 return SDValue(); 9894 9895 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 9896 9897 unsigned Opc; 9898 bool ExtraOp = false; 9899 switch (Op.getOpcode()) { 9900 default: assert(0 && "Invalid code"); 9901 case ISD::ADDC: Opc = X86ISD::ADD; break; 9902 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; 9903 case ISD::SUBC: Opc = X86ISD::SUB; break; 9904 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; 9905 } 9906 9907 if (!ExtraOp) 9908 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 9909 Op.getOperand(1)); 9910 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 9911 Op.getOperand(1), Op.getOperand(2)); 9912} 9913 9914/// LowerOperation - Provide custom lowering hooks for some operations. 9915/// 9916SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 9917 switch (Op.getOpcode()) { 9918 default: llvm_unreachable("Should not custom lower this!"); 9919 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); 9920 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 9921 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op,DAG); 9922 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 9923 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 9924 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 9925 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 9926 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 9927 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 9928 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 9929 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); 9930 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); 9931 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 9932 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 9933 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 9934 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 9935 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 9936 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 9937 case ISD::SHL_PARTS: 9938 case ISD::SRA_PARTS: 9939 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); 9940 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 9941 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 9942 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 9943 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 9944 case ISD::FABS: return LowerFABS(Op, DAG); 9945 case ISD::FNEG: return LowerFNEG(Op, DAG); 9946 case 
ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 9947 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); 9948 case ISD::SETCC: return LowerSETCC(Op, DAG); 9949 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 9950 case ISD::SELECT: return LowerSELECT(Op, DAG); 9951 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 9952 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 9953 case ISD::VASTART: return LowerVASTART(Op, DAG); 9954 case ISD::VAARG: return LowerVAARG(Op, DAG); 9955 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 9956 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 9957 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 9958 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 9959 case ISD::FRAME_TO_ARGS_OFFSET: 9960 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 9961 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 9962 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 9963 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 9964 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 9965 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 9966 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 9967 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 9968 case ISD::SRA: 9969 case ISD::SRL: 9970 case ISD::SHL: return LowerShift(Op, DAG); 9971 case ISD::SADDO: 9972 case ISD::UADDO: 9973 case ISD::SSUBO: 9974 case ISD::USUBO: 9975 case ISD::SMULO: 9976 case ISD::UMULO: return LowerXALUO(Op, DAG); 9977 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 9978 case ISD::BITCAST: return LowerBITCAST(Op, DAG); 9979 case ISD::ADDC: 9980 case ISD::ADDE: 9981 case ISD::SUBC: 9982 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 9983 } 9984} 9985 9986void X86TargetLowering:: 9987ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 9988 SelectionDAG &DAG, unsigned NewOp) const { 9989 EVT T = Node->getValueType(0); 9990 DebugLoc dl = Node->getDebugLoc(); 9991 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 9992 9993 SDValue Chain = Node->getOperand(0); 9994 SDValue In1 = Node->getOperand(1); 9995 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 9996 Node->getOperand(2), DAG.getIntPtrConstant(0)); 9997 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 9998 Node->getOperand(2), DAG.getIntPtrConstant(1)); 9999 SDValue Ops[] = { Chain, In1, In2L, In2H }; 10000 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 10001 SDValue Result = 10002 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 10003 cast<MemSDNode>(Node)->getMemOperand()); 10004 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 10005 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 10006 Results.push_back(Result.getValue(2)); 10007} 10008 10009/// ReplaceNodeResults - Replace a node with an illegal result type 10010/// with a new node built out of custom code. 10011void X86TargetLowering::ReplaceNodeResults(SDNode *N, 10012 SmallVectorImpl<SDValue>&Results, 10013 SelectionDAG &DAG) const { 10014 DebugLoc dl = N->getDebugLoc(); 10015 switch (N->getOpcode()) { 10016 default: 10017 assert(false && "Do not know how to custom type legalize this operation!"); 10018 return; 10019 case ISD::SIGN_EXTEND_INREG: 10020 case ISD::ADDC: 10021 case ISD::ADDE: 10022 case ISD::SUBC: 10023 case ISD::SUBE: 10024 // We don't want to expand or promote these. 
10025 return; 10026 case ISD::FP_TO_SINT: { 10027 std::pair<SDValue,SDValue> Vals = 10028 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 10029 SDValue FIST = Vals.first, StackSlot = Vals.second; 10030 if (FIST.getNode() != 0) { 10031 EVT VT = N->getValueType(0); 10032 // Return a load from the stack slot. 10033 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 10034 MachinePointerInfo(), false, false, 0)); 10035 } 10036 return; 10037 } 10038 case ISD::READCYCLECOUNTER: { 10039 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 10040 SDValue TheChain = N->getOperand(0); 10041 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 10042 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 10043 rd.getValue(1)); 10044 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 10045 eax.getValue(2)); 10046 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 10047 SDValue Ops[] = { eax, edx }; 10048 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 10049 Results.push_back(edx.getValue(1)); 10050 return; 10051 } 10052 case ISD::ATOMIC_CMP_SWAP: { 10053 EVT T = N->getValueType(0); 10054 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 10055 SDValue cpInL, cpInH; 10056 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 10057 DAG.getConstant(0, MVT::i32)); 10058 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 10059 DAG.getConstant(1, MVT::i32)); 10060 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 10061 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 10062 cpInL.getValue(1)); 10063 SDValue swapInL, swapInH; 10064 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 10065 DAG.getConstant(0, MVT::i32)); 10066 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 10067 DAG.getConstant(1, MVT::i32)); 10068 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 10069 cpInH.getValue(1)); 10070 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 10071 swapInL.getValue(1)); 10072 SDValue Ops[] = { swapInH.getValue(0), 10073 N->getOperand(1), 10074 swapInH.getValue(1) }; 10075 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 10076 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 10077 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, 10078 Ops, 3, T, MMO); 10079 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 10080 MVT::i32, Result.getValue(1)); 10081 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 10082 MVT::i32, cpOutL.getValue(2)); 10083 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 10084 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 10085 Results.push_back(cpOutH.getValue(1)); 10086 return; 10087 } 10088 case ISD::ATOMIC_LOAD_ADD: 10089 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 10090 return; 10091 case ISD::ATOMIC_LOAD_AND: 10092 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 10093 return; 10094 case ISD::ATOMIC_LOAD_NAND: 10095 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 10096 return; 10097 case ISD::ATOMIC_LOAD_OR: 10098 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 10099 return; 10100 case ISD::ATOMIC_LOAD_SUB: 10101 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 10102 return; 10103 case ISD::ATOMIC_LOAD_XOR: 
10104 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 10105 return; 10106 case ISD::ATOMIC_SWAP: 10107 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 10108 return; 10109 } 10110} 10111 10112const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 10113 switch (Opcode) { 10114 default: return NULL; 10115 case X86ISD::BSF: return "X86ISD::BSF"; 10116 case X86ISD::BSR: return "X86ISD::BSR"; 10117 case X86ISD::SHLD: return "X86ISD::SHLD"; 10118 case X86ISD::SHRD: return "X86ISD::SHRD"; 10119 case X86ISD::FAND: return "X86ISD::FAND"; 10120 case X86ISD::FOR: return "X86ISD::FOR"; 10121 case X86ISD::FXOR: return "X86ISD::FXOR"; 10122 case X86ISD::FSRL: return "X86ISD::FSRL"; 10123 case X86ISD::FILD: return "X86ISD::FILD"; 10124 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 10125 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 10126 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 10127 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 10128 case X86ISD::FLD: return "X86ISD::FLD"; 10129 case X86ISD::FST: return "X86ISD::FST"; 10130 case X86ISD::CALL: return "X86ISD::CALL"; 10131 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 10132 case X86ISD::BT: return "X86ISD::BT"; 10133 case X86ISD::CMP: return "X86ISD::CMP"; 10134 case X86ISD::COMI: return "X86ISD::COMI"; 10135 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 10136 case X86ISD::SETCC: return "X86ISD::SETCC"; 10137 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 10138 case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd"; 10139 case X86ISD::FSETCCss: return "X86ISD::FSETCCss"; 10140 case X86ISD::CMOV: return "X86ISD::CMOV"; 10141 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 10142 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 10143 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 10144 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 10145 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 10146 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 10147 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 10148 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 10149 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 10150 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 10151 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 10152 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 10153 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 10154 case X86ISD::ANDNP: return "X86ISD::ANDNP"; 10155 case X86ISD::PSIGNB: return "X86ISD::PSIGNB"; 10156 case X86ISD::PSIGNW: return "X86ISD::PSIGNW"; 10157 case X86ISD::PSIGND: return "X86ISD::PSIGND"; 10158 case X86ISD::PBLENDVB: return "X86ISD::PBLENDVB"; 10159 case X86ISD::FMAX: return "X86ISD::FMAX"; 10160 case X86ISD::FMIN: return "X86ISD::FMIN"; 10161 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 10162 case X86ISD::FRCP: return "X86ISD::FRCP"; 10163 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 10164 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 10165 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 10166 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 10167 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 10168 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 10169 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 10170 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 10171 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 10172 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 10173 case X86ISD::ATOMXOR64_DAG: return 
"X86ISD::ATOMXOR64_DAG"; 10174 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 10175 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 10176 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 10177 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 10178 case X86ISD::VSHL: return "X86ISD::VSHL"; 10179 case X86ISD::VSRL: return "X86ISD::VSRL"; 10180 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 10181 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 10182 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 10183 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 10184 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 10185 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 10186 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 10187 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 10188 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 10189 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 10190 case X86ISD::ADD: return "X86ISD::ADD"; 10191 case X86ISD::SUB: return "X86ISD::SUB"; 10192 case X86ISD::ADC: return "X86ISD::ADC"; 10193 case X86ISD::SBB: return "X86ISD::SBB"; 10194 case X86ISD::SMUL: return "X86ISD::SMUL"; 10195 case X86ISD::UMUL: return "X86ISD::UMUL"; 10196 case X86ISD::INC: return "X86ISD::INC"; 10197 case X86ISD::DEC: return "X86ISD::DEC"; 10198 case X86ISD::OR: return "X86ISD::OR"; 10199 case X86ISD::XOR: return "X86ISD::XOR"; 10200 case X86ISD::AND: return "X86ISD::AND"; 10201 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 10202 case X86ISD::PTEST: return "X86ISD::PTEST"; 10203 case X86ISD::TESTP: return "X86ISD::TESTP"; 10204 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 10205 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 10206 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 10207 case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; 10208 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 10209 case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; 10210 case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; 10211 case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; 10212 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 10213 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 10214 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 10215 case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; 10216 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 10217 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 10218 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 10219 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 10220 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 10221 case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; 10222 case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; 10223 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 10224 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 10225 case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; 10226 case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; 10227 case X86ISD::VUNPCKLPDY: return "X86ISD::VUNPCKLPDY"; 10228 case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; 10229 case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; 10230 case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; 10231 case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; 10232 case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; 10233 case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; 10234 case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; 10235 case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; 10236 case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; 10237 case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; 10238 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; 10239 case 
X86ISD::VPERMILPS: return "X86ISD::VPERMILPS"; 10240 case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY"; 10241 case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD"; 10242 case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY"; 10243 case X86ISD::VPERM2F128: return "X86ISD::VPERM2F128"; 10244 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 10245 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 10246 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 10247 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; 10248 } 10249} 10250 10251// isLegalAddressingMode - Return true if the addressing mode represented 10252// by AM is legal for this target, for a load/store of the specified type. 10253bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 10254 Type *Ty) const { 10255 // X86 supports extremely general addressing modes. 10256 CodeModel::Model M = getTargetMachine().getCodeModel(); 10257 Reloc::Model R = getTargetMachine().getRelocationModel(); 10258 10259 // X86 allows a sign-extended 32-bit immediate field as a displacement. 10260 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 10261 return false; 10262 10263 if (AM.BaseGV) { 10264 unsigned GVFlags = 10265 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 10266 10267 // If a reference to this global requires an extra load, we can't fold it. 10268 if (isGlobalStubReference(GVFlags)) 10269 return false; 10270 10271 // If BaseGV requires a register for the PIC base, we cannot also have a 10272 // BaseReg specified. 10273 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 10274 return false; 10275 10276 // If lower 4G is not available, then we must use rip-relative addressing. 10277 if ((M != CodeModel::Small || R != Reloc::Static) && 10278 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 10279 return false; 10280 } 10281 10282 switch (AM.Scale) { 10283 case 0: 10284 case 1: 10285 case 2: 10286 case 4: 10287 case 8: 10288 // These scales always work. 10289 break; 10290 case 3: 10291 case 5: 10292 case 9: 10293 // These scales are formed with basereg+scalereg. Only accept if there is 10294 // no basereg yet. 10295 if (AM.HasBaseReg) 10296 return false; 10297 break; 10298 default: // Other stuff never works. 10299 return false; 10300 } 10301 10302 return true; 10303} 10304 10305 10306bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 10307 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 10308 return false; 10309 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 10310 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 10311 if (NumBits1 <= NumBits2) 10312 return false; 10313 return true; 10314} 10315 10316bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 10317 if (!VT1.isInteger() || !VT2.isInteger()) 10318 return false; 10319 unsigned NumBits1 = VT1.getSizeInBits(); 10320 unsigned NumBits2 = VT2.getSizeInBits(); 10321 if (NumBits1 <= NumBits2) 10322 return false; 10323 return true; 10324} 10325 10326bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { 10327 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 10328 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 10329} 10330 10331bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 10332 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 
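  // For example, a 32-bit def such as "movl %esi, %eax" already clears bits
  // 63:32 of %rax, so the i32 -> i64 zero extension needs no extra instruction.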
10333 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 10334} 10335 10336bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 10337 // i16 instructions are longer (0x66 prefix) and potentially slower. 10338 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 10339} 10340 10341/// isShuffleMaskLegal - Targets can use this to indicate that they only 10342/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 10343/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 10344/// are assumed to be legal. 10345bool 10346X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 10347 EVT VT) const { 10348 // Very little shuffling can be done for 64-bit vectors right now. 10349 if (VT.getSizeInBits() == 64) 10350 return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); 10351 10352 // FIXME: pshufb, blends, shifts. 10353 return (VT.getVectorNumElements() == 2 || 10354 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 10355 isMOVLMask(M, VT) || 10356 isSHUFPMask(M, VT) || 10357 isPSHUFDMask(M, VT) || 10358 isPSHUFHWMask(M, VT) || 10359 isPSHUFLWMask(M, VT) || 10360 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 10361 isUNPCKLMask(M, VT) || 10362 isUNPCKHMask(M, VT) || 10363 isUNPCKL_v_undef_Mask(M, VT) || 10364 isUNPCKH_v_undef_Mask(M, VT)); 10365} 10366 10367bool 10368X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 10369 EVT VT) const { 10370 unsigned NumElts = VT.getVectorNumElements(); 10371 // FIXME: This collection of masks seems suspect. 10372 if (NumElts == 2) 10373 return true; 10374 if (NumElts == 4 && VT.getSizeInBits() == 128) { 10375 return (isMOVLMask(Mask, VT) || 10376 isCommutedMOVLMask(Mask, VT, true) || 10377 isSHUFPMask(Mask, VT) || 10378 isCommutedSHUFPMask(Mask, VT)); 10379 } 10380 return false; 10381} 10382 10383//===----------------------------------------------------------------------===// 10384// X86 Scheduler Hooks 10385//===----------------------------------------------------------------------===// 10386 10387// private utility function 10388MachineBasicBlock * 10389X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 10390 MachineBasicBlock *MBB, 10391 unsigned regOpc, 10392 unsigned immOpc, 10393 unsigned LoadOpc, 10394 unsigned CXchgOpc, 10395 unsigned notOpc, 10396 unsigned EAXreg, 10397 TargetRegisterClass *RC, 10398 bool invSrc) const { 10399 // For the atomic bitwise operator, we generate 10400 // thisMBB: 10401 // newMBB: 10402 // ld t1 = [bitinstr.addr] 10403 // op t2 = t1, [bitinstr.val] 10404 // mov EAX = t1 10405 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 10406 // bz newMBB 10407 // fallthrough -->nextMBB 10408 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10409 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 10410 MachineFunction::iterator MBBIter = MBB; 10411 ++MBBIter; 10412 10413 /// First build the CFG 10414 MachineFunction *F = MBB->getParent(); 10415 MachineBasicBlock *thisMBB = MBB; 10416 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 10417 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 10418 F->insert(MBBIter, newMBB); 10419 F->insert(MBBIter, nextMBB); 10420 10421 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 
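  // (Everything after the atomic pseudo instruction, together with thisMBB's
  // successor list, moves into nextMBB; newMBB will hold only the retry loop.)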
10422 nextMBB->splice(nextMBB->begin(), thisMBB, 10423 llvm::next(MachineBasicBlock::iterator(bInstr)), 10424 thisMBB->end()); 10425 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 10426 10427 // Update thisMBB to fall through to newMBB 10428 thisMBB->addSuccessor(newMBB); 10429 10430 // newMBB jumps to itself and fall through to nextMBB 10431 newMBB->addSuccessor(nextMBB); 10432 newMBB->addSuccessor(newMBB); 10433 10434 // Insert instructions into newMBB based on incoming instruction 10435 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && 10436 "unexpected number of operands"); 10437 DebugLoc dl = bInstr->getDebugLoc(); 10438 MachineOperand& destOper = bInstr->getOperand(0); 10439 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 10440 int numArgs = bInstr->getNumOperands() - 1; 10441 for (int i=0; i < numArgs; ++i) 10442 argOpers[i] = &bInstr->getOperand(i+1); 10443 10444 // x86 address has 4 operands: base, index, scale, and displacement 10445 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 10446 int valArgIndx = lastAddrIndx + 1; 10447 10448 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 10449 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 10450 for (int i=0; i <= lastAddrIndx; ++i) 10451 (*MIB).addOperand(*argOpers[i]); 10452 10453 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 10454 if (invSrc) { 10455 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 10456 } 10457 else 10458 tt = t1; 10459 10460 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 10461 assert((argOpers[valArgIndx]->isReg() || 10462 argOpers[valArgIndx]->isImm()) && 10463 "invalid operand"); 10464 if (argOpers[valArgIndx]->isReg()) 10465 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 10466 else 10467 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 10468 MIB.addReg(tt); 10469 (*MIB).addOperand(*argOpers[valArgIndx]); 10470 10471 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); 10472 MIB.addReg(t1); 10473 10474 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 10475 for (int i=0; i <= lastAddrIndx; ++i) 10476 (*MIB).addOperand(*argOpers[i]); 10477 MIB.addReg(t2); 10478 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 10479 (*MIB).setMemRefs(bInstr->memoperands_begin(), 10480 bInstr->memoperands_end()); 10481 10482 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 10483 MIB.addReg(EAXreg); 10484 10485 // insert branch 10486 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 10487 10488 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 10489 return nextMBB; 10490} 10491 10492// private utility function: 64 bit atomics on 32 bit host. 
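// The 64-bit operand lives in a pair of 32-bit registers; the loop built
// below retries LOCK CMPXCHG8B (expected value in EDX:EAX, replacement in
// ECX:EBX) until the compare-and-swap succeeds.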
10493MachineBasicBlock * 10494X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 10495 MachineBasicBlock *MBB, 10496 unsigned regOpcL, 10497 unsigned regOpcH, 10498 unsigned immOpcL, 10499 unsigned immOpcH, 10500 bool invSrc) const { 10501 // For the atomic bitwise operator, we generate 10502 // thisMBB (instructions are in pairs, except cmpxchg8b) 10503 // ld t1,t2 = [bitinstr.addr] 10504 // newMBB: 10505 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 10506 // op t5, t6 <- out1, out2, [bitinstr.val] 10507 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 10508 // mov ECX, EBX <- t5, t6 10509 // mov EAX, EDX <- t1, t2 10510 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 10511 // mov t3, t4 <- EAX, EDX 10512 // bz newMBB 10513 // result in out1, out2 10514 // fallthrough -->nextMBB 10515 10516 const TargetRegisterClass *RC = X86::GR32RegisterClass; 10517 const unsigned LoadOpc = X86::MOV32rm; 10518 const unsigned NotOpc = X86::NOT32r; 10519 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10520 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 10521 MachineFunction::iterator MBBIter = MBB; 10522 ++MBBIter; 10523 10524 /// First build the CFG 10525 MachineFunction *F = MBB->getParent(); 10526 MachineBasicBlock *thisMBB = MBB; 10527 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 10528 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 10529 F->insert(MBBIter, newMBB); 10530 F->insert(MBBIter, nextMBB); 10531 10532 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 10533 nextMBB->splice(nextMBB->begin(), thisMBB, 10534 llvm::next(MachineBasicBlock::iterator(bInstr)), 10535 thisMBB->end()); 10536 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 10537 10538 // Update thisMBB to fall through to newMBB 10539 thisMBB->addSuccessor(newMBB); 10540 10541 // newMBB jumps to itself and fall through to nextMBB 10542 newMBB->addSuccessor(nextMBB); 10543 newMBB->addSuccessor(newMBB); 10544 10545 DebugLoc dl = bInstr->getDebugLoc(); 10546 // Insert instructions into newMBB based on incoming instruction 10547 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 10548 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && 10549 "unexpected number of operands"); 10550 MachineOperand& dest1Oper = bInstr->getOperand(0); 10551 MachineOperand& dest2Oper = bInstr->getOperand(1); 10552 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 10553 for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { 10554 argOpers[i] = &bInstr->getOperand(i+2); 10555 10556 // We use some of the operands multiple times, so conservatively just 10557 // clear any kill flags that might be present. 10558 if (argOpers[i]->isReg() && argOpers[i]->isUse()) 10559 argOpers[i]->setIsKill(false); 10560 } 10561 10562 // x86 address has 5 operands: base, index, scale, displacement, and segment. 10563 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 10564 10565 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 10566 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 10567 for (int i=0; i <= lastAddrIndx; ++i) 10568 (*MIB).addOperand(*argOpers[i]); 10569 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 10570 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 10571 // add 4 to displacement. 
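  // (This second load fetches the high 32 bits of the operand from
  // [addr + 4]; the displacement operand is adjusted just below.)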
10572 for (int i=0; i <= lastAddrIndx-2; ++i) 10573 (*MIB).addOperand(*argOpers[i]); 10574 MachineOperand newOp3 = *(argOpers[3]); 10575 if (newOp3.isImm()) 10576 newOp3.setImm(newOp3.getImm()+4); 10577 else 10578 newOp3.setOffset(newOp3.getOffset()+4); 10579 (*MIB).addOperand(newOp3); 10580 (*MIB).addOperand(*argOpers[lastAddrIndx]); 10581 10582 // t3/4 are defined later, at the bottom of the loop 10583 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 10584 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 10585 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 10586 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 10587 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 10588 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 10589 10590 // The subsequent operations should be using the destination registers of 10591 //the PHI instructions. 10592 if (invSrc) { 10593 t1 = F->getRegInfo().createVirtualRegister(RC); 10594 t2 = F->getRegInfo().createVirtualRegister(RC); 10595 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 10596 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 10597 } else { 10598 t1 = dest1Oper.getReg(); 10599 t2 = dest2Oper.getReg(); 10600 } 10601 10602 int valArgIndx = lastAddrIndx + 1; 10603 assert((argOpers[valArgIndx]->isReg() || 10604 argOpers[valArgIndx]->isImm()) && 10605 "invalid operand"); 10606 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 10607 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 10608 if (argOpers[valArgIndx]->isReg()) 10609 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 10610 else 10611 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 10612 if (regOpcL != X86::MOV32rr) 10613 MIB.addReg(t1); 10614 (*MIB).addOperand(*argOpers[valArgIndx]); 10615 assert(argOpers[valArgIndx + 1]->isReg() == 10616 argOpers[valArgIndx]->isReg()); 10617 assert(argOpers[valArgIndx + 1]->isImm() == 10618 argOpers[valArgIndx]->isImm()); 10619 if (argOpers[valArgIndx + 1]->isReg()) 10620 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 10621 else 10622 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 10623 if (regOpcH != X86::MOV32rr) 10624 MIB.addReg(t2); 10625 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 10626 10627 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 10628 MIB.addReg(t1); 10629 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); 10630 MIB.addReg(t2); 10631 10632 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); 10633 MIB.addReg(t5); 10634 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); 10635 MIB.addReg(t6); 10636 10637 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 10638 for (int i=0; i <= lastAddrIndx; ++i) 10639 (*MIB).addOperand(*argOpers[i]); 10640 10641 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 10642 (*MIB).setMemRefs(bInstr->memoperands_begin(), 10643 bInstr->memoperands_end()); 10644 10645 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); 10646 MIB.addReg(X86::EAX); 10647 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); 10648 MIB.addReg(X86::EDX); 10649 10650 // insert branch 10651 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 10652 10653 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 
10654 return nextMBB; 10655} 10656 10657// private utility function 10658MachineBasicBlock * 10659X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 10660 MachineBasicBlock *MBB, 10661 unsigned cmovOpc) const { 10662 // For the atomic min/max operator, we generate 10663 // thisMBB: 10664 // newMBB: 10665 // ld t1 = [min/max.addr] 10666 // mov t2 = [min/max.val] 10667 // cmp t1, t2 10668 // cmov[cond] t2 = t1 10669 // mov EAX = t1 10670 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 10671 // bz newMBB 10672 // fallthrough -->nextMBB 10673 // 10674 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10675 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 10676 MachineFunction::iterator MBBIter = MBB; 10677 ++MBBIter; 10678 10679 /// First build the CFG 10680 MachineFunction *F = MBB->getParent(); 10681 MachineBasicBlock *thisMBB = MBB; 10682 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 10683 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 10684 F->insert(MBBIter, newMBB); 10685 F->insert(MBBIter, nextMBB); 10686 10687 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 10688 nextMBB->splice(nextMBB->begin(), thisMBB, 10689 llvm::next(MachineBasicBlock::iterator(mInstr)), 10690 thisMBB->end()); 10691 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 10692 10693 // Update thisMBB to fall through to newMBB 10694 thisMBB->addSuccessor(newMBB); 10695 10696 // newMBB jumps to newMBB and fall through to nextMBB 10697 newMBB->addSuccessor(nextMBB); 10698 newMBB->addSuccessor(newMBB); 10699 10700 DebugLoc dl = mInstr->getDebugLoc(); 10701 // Insert instructions into newMBB based on incoming instruction 10702 assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && 10703 "unexpected number of operands"); 10704 MachineOperand& destOper = mInstr->getOperand(0); 10705 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 10706 int numArgs = mInstr->getNumOperands() - 1; 10707 for (int i=0; i < numArgs; ++i) 10708 argOpers[i] = &mInstr->getOperand(i+1); 10709 10710 // x86 address has 4 operands: base, index, scale, and displacement 10711 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 10712 int valArgIndx = lastAddrIndx + 1; 10713 10714 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 10715 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 10716 for (int i=0; i <= lastAddrIndx; ++i) 10717 (*MIB).addOperand(*argOpers[i]); 10718 10719 // We only support register and immediate values 10720 assert((argOpers[valArgIndx]->isReg() || 10721 argOpers[valArgIndx]->isImm()) && 10722 "invalid operand"); 10723 10724 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 10725 if (argOpers[valArgIndx]->isReg()) 10726 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); 10727 else 10728 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 10729 (*MIB).addOperand(*argOpers[valArgIndx]); 10730 10731 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 10732 MIB.addReg(t1); 10733 10734 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 10735 MIB.addReg(t1); 10736 MIB.addReg(t2); 10737 10738 // Generate movc 10739 unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 10740 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 10741 MIB.addReg(t2); 10742 MIB.addReg(t1); 10743 10744 // Cmp and exchange if none has modified the memory location 10745 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 
10746 for (int i=0; i <= lastAddrIndx; ++i) 10747 (*MIB).addOperand(*argOpers[i]); 10748 MIB.addReg(t3); 10749 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 10750 (*MIB).setMemRefs(mInstr->memoperands_begin(), 10751 mInstr->memoperands_end()); 10752 10753 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 10754 MIB.addReg(X86::EAX); 10755 10756 // insert branch 10757 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 10758 10759 mInstr->eraseFromParent(); // The pseudo instruction is gone now. 10760 return nextMBB; 10761} 10762 10763// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 10764// or XMM0_V32I8 in AVX all of this code can be replaced with that 10765// in the .td file. 10766MachineBasicBlock * 10767X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 10768 unsigned numArgs, bool memArg) const { 10769 assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && 10770 "Target must have SSE4.2 or AVX features enabled"); 10771 10772 DebugLoc dl = MI->getDebugLoc(); 10773 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10774 unsigned Opc; 10775 if (!Subtarget->hasAVX()) { 10776 if (memArg) 10777 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 10778 else 10779 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 10780 } else { 10781 if (memArg) 10782 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 10783 else 10784 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 10785 } 10786 10787 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 10788 for (unsigned i = 0; i < numArgs; ++i) { 10789 MachineOperand &Op = MI->getOperand(i+1); 10790 if (!(Op.isReg() && Op.isImplicit())) 10791 MIB.addOperand(Op); 10792 } 10793 BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 10794 .addReg(X86::XMM0); 10795 10796 MI->eraseFromParent(); 10797 return BB; 10798} 10799 10800MachineBasicBlock * 10801X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const { 10802 DebugLoc dl = MI->getDebugLoc(); 10803 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10804 10805 // Address into RAX/EAX, other two args into ECX, EDX. 10806 unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; 10807 unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 10808 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); 10809 for (int i = 0; i < X86::AddrNumOperands; ++i) 10810 MIB.addOperand(MI->getOperand(i)); 10811 10812 unsigned ValOps = X86::AddrNumOperands; 10813 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 10814 .addReg(MI->getOperand(ValOps).getReg()); 10815 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) 10816 .addReg(MI->getOperand(ValOps+1).getReg()); 10817 10818 // The instruction doesn't actually take any operands though. 10819 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); 10820 10821 MI->eraseFromParent(); // The pseudo is gone now. 10822 return BB; 10823} 10824 10825MachineBasicBlock * 10826X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const { 10827 DebugLoc dl = MI->getDebugLoc(); 10828 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10829 10830 // First arg in ECX, the second in EAX. 
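  // These physical registers are fixed by the instruction itself: MWAIT
  // reads its hints from EAX and its extensions from ECX, so the pseudo's
  // virtual-register operands are copied into those registers before the
  // real MWAITrr (which has no explicit operands) is emitted.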
10831 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 10832 .addReg(MI->getOperand(0).getReg()); 10833 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) 10834 .addReg(MI->getOperand(1).getReg()); 10835 10836 // The instruction doesn't actually take any operands though. 10837 BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr)); 10838 10839 MI->eraseFromParent(); // The pseudo is gone now. 10840 return BB; 10841} 10842 10843MachineBasicBlock * 10844X86TargetLowering::EmitVAARG64WithCustomInserter( 10845 MachineInstr *MI, 10846 MachineBasicBlock *MBB) const { 10847 // Emit va_arg instruction on X86-64. 10848 10849 // Operands to this pseudo-instruction: 10850 // 0 ) Output : destination address (reg) 10851 // 1-5) Input : va_list address (addr, i64mem) 10852 // 6 ) ArgSize : Size (in bytes) of vararg type 10853 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 10854 // 8 ) Align : Alignment of type 10855 // 9 ) EFLAGS (implicit-def) 10856 10857 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 10858 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); 10859 10860 unsigned DestReg = MI->getOperand(0).getReg(); 10861 MachineOperand &Base = MI->getOperand(1); 10862 MachineOperand &Scale = MI->getOperand(2); 10863 MachineOperand &Index = MI->getOperand(3); 10864 MachineOperand &Disp = MI->getOperand(4); 10865 MachineOperand &Segment = MI->getOperand(5); 10866 unsigned ArgSize = MI->getOperand(6).getImm(); 10867 unsigned ArgMode = MI->getOperand(7).getImm(); 10868 unsigned Align = MI->getOperand(8).getImm(); 10869 10870 // Memory Reference 10871 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 10872 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 10873 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 10874 10875 // Machine Information 10876 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10877 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 10878 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 10879 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 10880 DebugLoc DL = MI->getDebugLoc(); 10881 10882 // struct va_list { 10883 // i32 gp_offset 10884 // i32 fp_offset 10885 // i64 overflow_area (address) 10886 // i64 reg_save_area (address) 10887 // } 10888 // sizeof(va_list) = 24 10889 // alignment(va_list) = 8 10890 10891 unsigned TotalNumIntRegs = 6; 10892 unsigned TotalNumXMMRegs = 8; 10893 bool UseGPOffset = (ArgMode == 1); 10894 bool UseFPOffset = (ArgMode == 2); 10895 unsigned MaxOffset = TotalNumIntRegs * 8 + 10896 (UseFPOffset ? TotalNumXMMRegs * 16 : 0); 10897 10898 /* Align ArgSize to a multiple of 8 */ 10899 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 10900 bool NeedsAlign = (Align > 8); 10901 10902 MachineBasicBlock *thisMBB = MBB; 10903 MachineBasicBlock *overflowMBB; 10904 MachineBasicBlock *offsetMBB; 10905 MachineBasicBlock *endMBB; 10906 10907 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 10908 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 10909 unsigned OffsetReg = 0; 10910 10911 if (!UseGPOffset && !UseFPOffset) { 10912 // If we only pull from the overflow region, we don't create a branch. 10913 // We don't need to alter control flow. 
10914 OffsetDestReg = 0; // unused 10915 OverflowDestReg = DestReg; 10916 10917 offsetMBB = NULL; 10918 overflowMBB = thisMBB; 10919 endMBB = thisMBB; 10920 } else { 10921 // First emit code to check if gp_offset (or fp_offset) is below the bound. 10922 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 10923 // If not, pull from overflow_area. (branch to overflowMBB) 10924 // 10925 // thisMBB 10926 // | . 10927 // | . 10928 // offsetMBB overflowMBB 10929 // | . 10930 // | . 10931 // endMBB 10932 10933 // Registers for the PHI in endMBB 10934 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 10935 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 10936 10937 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 10938 MachineFunction *MF = MBB->getParent(); 10939 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10940 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10941 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10942 10943 MachineFunction::iterator MBBIter = MBB; 10944 ++MBBIter; 10945 10946 // Insert the new basic blocks 10947 MF->insert(MBBIter, offsetMBB); 10948 MF->insert(MBBIter, overflowMBB); 10949 MF->insert(MBBIter, endMBB); 10950 10951 // Transfer the remainder of MBB and its successor edges to endMBB. 10952 endMBB->splice(endMBB->begin(), thisMBB, 10953 llvm::next(MachineBasicBlock::iterator(MI)), 10954 thisMBB->end()); 10955 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 10956 10957 // Make offsetMBB and overflowMBB successors of thisMBB 10958 thisMBB->addSuccessor(offsetMBB); 10959 thisMBB->addSuccessor(overflowMBB); 10960 10961 // endMBB is a successor of both offsetMBB and overflowMBB 10962 offsetMBB->addSuccessor(endMBB); 10963 overflowMBB->addSuccessor(endMBB); 10964 10965 // Load the offset value into a register 10966 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 10967 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 10968 .addOperand(Base) 10969 .addOperand(Scale) 10970 .addOperand(Index) 10971 .addDisp(Disp, UseFPOffset ? 4 : 0) 10972 .addOperand(Segment) 10973 .setMemRefs(MMOBegin, MMOEnd); 10974 10975 // Check if there is enough room left to pull this argument. 10976 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 10977 .addReg(OffsetReg) 10978 .addImm(MaxOffset + 8 - ArgSizeA8); 10979 10980 // Branch to "overflowMBB" if offset >= max 10981 // Fall through to "offsetMBB" otherwise 10982 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 10983 .addMBB(overflowMBB); 10984 } 10985 10986 // In offsetMBB, emit code to use the reg_save_area. 10987 if (offsetMBB) { 10988 assert(OffsetReg != 0); 10989 10990 // Read the reg_save_area address. 10991 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 10992 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 10993 .addOperand(Base) 10994 .addOperand(Scale) 10995 .addOperand(Index) 10996 .addDisp(Disp, 16) 10997 .addOperand(Segment) 10998 .setMemRefs(MMOBegin, MMOEnd); 10999 11000 // Zero-extend the offset 11001 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 11002 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 11003 .addImm(0) 11004 .addReg(OffsetReg) 11005 .addImm(X86::sub_32bit); 11006 11007 // Add the offset to the reg_save_area to get the final address. 
11008 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 11009 .addReg(OffsetReg64) 11010 .addReg(RegSaveReg); 11011 11012 // Compute the offset for the next argument 11013 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 11014 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 11015 .addReg(OffsetReg) 11016 .addImm(UseFPOffset ? 16 : 8); 11017 11018 // Store it back into the va_list. 11019 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 11020 .addOperand(Base) 11021 .addOperand(Scale) 11022 .addOperand(Index) 11023 .addDisp(Disp, UseFPOffset ? 4 : 0) 11024 .addOperand(Segment) 11025 .addReg(NextOffsetReg) 11026 .setMemRefs(MMOBegin, MMOEnd); 11027 11028 // Jump to endMBB 11029 BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) 11030 .addMBB(endMBB); 11031 } 11032 11033 // 11034 // Emit code to use overflow area 11035 // 11036 11037 // Load the overflow_area address into a register. 11038 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 11039 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 11040 .addOperand(Base) 11041 .addOperand(Scale) 11042 .addOperand(Index) 11043 .addDisp(Disp, 8) 11044 .addOperand(Segment) 11045 .setMemRefs(MMOBegin, MMOEnd); 11046 11047 // If we need to align it, do so. Otherwise, just copy the address 11048 // to OverflowDestReg. 11049 if (NeedsAlign) { 11050 // Align the overflow address 11051 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 11052 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 11053 11054 // aligned_addr = (addr + (align-1)) & ~(align-1) 11055 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 11056 .addReg(OverflowAddrReg) 11057 .addImm(Align-1); 11058 11059 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 11060 .addReg(TmpReg) 11061 .addImm(~(uint64_t)(Align-1)); 11062 } else { 11063 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 11064 .addReg(OverflowAddrReg); 11065 } 11066 11067 // Compute the next overflow address after this argument. 11068 // (the overflow address should be kept 8-byte aligned) 11069 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 11070 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 11071 .addReg(OverflowDestReg) 11072 .addImm(ArgSizeA8); 11073 11074 // Store the new overflow address. 11075 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 11076 .addOperand(Base) 11077 .addOperand(Scale) 11078 .addOperand(Index) 11079 .addDisp(Disp, 8) 11080 .addOperand(Segment) 11081 .addReg(NextAddrReg) 11082 .setMemRefs(MMOBegin, MMOEnd); 11083 11084 // If we branched, emit the PHI to the front of endMBB. 11085 if (offsetMBB) { 11086 BuildMI(*endMBB, endMBB->begin(), DL, 11087 TII->get(X86::PHI), DestReg) 11088 .addReg(OffsetDestReg).addMBB(offsetMBB) 11089 .addReg(OverflowDestReg).addMBB(overflowMBB); 11090 } 11091 11092 // Erase the pseudo instruction 11093 MI->eraseFromParent(); 11094 11095 return endMBB; 11096} 11097 11098MachineBasicBlock * 11099X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 11100 MachineInstr *MI, 11101 MachineBasicBlock *MBB) const { 11102 // Emit code to save XMM registers to the stack. The ABI says that the 11103 // number of registers to save is given in %al, so it's theoretically 11104 // possible to do an indirect jump trick to avoid saving all of them, 11105 // however this code takes a simpler approach and just executes all 11106 // of the stores if %al is non-zero. 
It's less code, and it's probably 11107 // easier on the hardware branch predictor, and stores aren't all that 11108 // expensive anyway. 11109 11110 // Create the new basic blocks. One block contains all the XMM stores, 11111 // and one block is the final destination regardless of whether any 11112 // stores were performed. 11113 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 11114 MachineFunction *F = MBB->getParent(); 11115 MachineFunction::iterator MBBIter = MBB; 11116 ++MBBIter; 11117 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 11118 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 11119 F->insert(MBBIter, XMMSaveMBB); 11120 F->insert(MBBIter, EndMBB); 11121 11122 // Transfer the remainder of MBB and its successor edges to EndMBB. 11123 EndMBB->splice(EndMBB->begin(), MBB, 11124 llvm::next(MachineBasicBlock::iterator(MI)), 11125 MBB->end()); 11126 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 11127 11128 // The original block will now fall through to the XMM save block. 11129 MBB->addSuccessor(XMMSaveMBB); 11130 // The XMMSaveMBB will fall through to the end block. 11131 XMMSaveMBB->addSuccessor(EndMBB); 11132 11133 // Now add the instructions. 11134 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11135 DebugLoc DL = MI->getDebugLoc(); 11136 11137 unsigned CountReg = MI->getOperand(0).getReg(); 11138 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 11139 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 11140 11141 if (!Subtarget->isTargetWin64()) { 11142 // If %al is 0, branch around the XMM save block. 11143 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 11144 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 11145 MBB->addSuccessor(EndMBB); 11146 } 11147 11148 // In the XMM save block, save all the XMM argument registers. 11149 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 11150 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 11151 MachineMemOperand *MMO = 11152 F->getMachineMemOperand( 11153 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 11154 MachineMemOperand::MOStore, 11155 /*Size=*/16, /*Align=*/16); 11156 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 11157 .addFrameIndex(RegSaveFrameIndex) 11158 .addImm(/*Scale=*/1) 11159 .addReg(/*IndexReg=*/0) 11160 .addImm(/*Disp=*/Offset) 11161 .addReg(/*Segment=*/0) 11162 .addReg(MI->getOperand(i).getReg()) 11163 .addMemOperand(MMO); 11164 } 11165 11166 MI->eraseFromParent(); // The pseudo instruction is gone now. 11167 11168 return EndMBB; 11169} 11170 11171MachineBasicBlock * 11172X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 11173 MachineBasicBlock *BB) const { 11174 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11175 DebugLoc DL = MI->getDebugLoc(); 11176 11177 // To "insert" a SELECT_CC instruction, we actually have to insert the 11178 // diamond control-flow pattern. The incoming instruction knows the 11179 // destination vreg to set, the condition code register to branch on, the 11180 // true/false values to select between, and a branch opcode to use. 11181 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 11182 MachineFunction::iterator It = BB; 11183 ++It; 11184 11185 // thisMBB: 11186 // ... 11187 // TrueVal = ... 
11188 // cmpTY ccX, r1, r2 11189 // bCC copy1MBB 11190 // fallthrough --> copy0MBB 11191 MachineBasicBlock *thisMBB = BB; 11192 MachineFunction *F = BB->getParent(); 11193 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 11194 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 11195 F->insert(It, copy0MBB); 11196 F->insert(It, sinkMBB); 11197 11198 // If the EFLAGS register isn't dead in the terminator, then claim that it's 11199 // live into the sink and copy blocks. 11200 const MachineFunction *MF = BB->getParent(); 11201 const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); 11202 BitVector ReservedRegs = TRI->getReservedRegs(*MF); 11203 11204 for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { 11205 const MachineOperand &MO = MI->getOperand(I); 11206 if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; 11207 unsigned Reg = MO.getReg(); 11208 if (Reg != X86::EFLAGS) continue; 11209 copy0MBB->addLiveIn(Reg); 11210 sinkMBB->addLiveIn(Reg); 11211 } 11212 11213 // Transfer the remainder of BB and its successor edges to sinkMBB. 11214 sinkMBB->splice(sinkMBB->begin(), BB, 11215 llvm::next(MachineBasicBlock::iterator(MI)), 11216 BB->end()); 11217 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 11218 11219 // Add the true and fallthrough blocks as its successors. 11220 BB->addSuccessor(copy0MBB); 11221 BB->addSuccessor(sinkMBB); 11222 11223 // Create the conditional branch instruction. 11224 unsigned Opc = 11225 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 11226 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 11227 11228 // copy0MBB: 11229 // %FalseValue = ... 11230 // # fallthrough to sinkMBB 11231 copy0MBB->addSuccessor(sinkMBB); 11232 11233 // sinkMBB: 11234 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 11235 // ... 11236 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 11237 TII->get(X86::PHI), MI->getOperand(0).getReg()) 11238 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 11239 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 11240 11241 MI->eraseFromParent(); // The pseudo instruction is gone now. 11242 return sinkMBB; 11243} 11244 11245MachineBasicBlock * 11246X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 11247 MachineBasicBlock *BB) const { 11248 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11249 DebugLoc DL = MI->getDebugLoc(); 11250 11251 assert(!Subtarget->isTargetEnvMacho()); 11252 11253 // The lowering is pretty easy: we're just emitting the call to _alloca. The 11254 // non-trivial part is impdef of ESP. 11255 11256 if (Subtarget->isTargetWin64()) { 11257 if (Subtarget->isTargetCygMing()) { 11258 // ___chkstk(Mingw64): 11259 // Clobbers R10, R11, RAX and EFLAGS. 11260 // Updates RSP. 11261 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 11262 .addExternalSymbol("___chkstk") 11263 .addReg(X86::RAX, RegState::Implicit) 11264 .addReg(X86::RSP, RegState::Implicit) 11265 .addReg(X86::RAX, RegState::Define | RegState::Implicit) 11266 .addReg(X86::RSP, RegState::Define | RegState::Implicit) 11267 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 11268 } else { 11269 // __chkstk(MSVCRT): does not update stack pointer. 11270 // Clobbers R10, R11 and EFLAGS. 11271 // FIXME: RAX(allocated size) might be reused and not killed. 
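      // Since the MSVCRT probe leaves RSP untouched, the SUB64rr emitted
      // below performs the actual allocation once the pages have been
      // probed.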
      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
        .addExternalSymbol("__chkstk")
        .addReg(X86::RAX, RegState::Implicit)
        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
      // RAX has the offset to be subtracted from RSP.
      BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
        .addReg(X86::RSP)
        .addReg(X86::RAX);
    }
  } else {
    const char *StackProbeSymbol =
      Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";

    BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
      .addExternalSymbol(StackProbeSymbol)
      .addReg(X86::EAX, RegState::Implicit)
      .addReg(X86::ESP, RegState::Implicit)
      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
      .addReg(X86::ESP, RegState::Define | RegState::Implicit)
      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
  }

  MI->eraseFromParent();   // The pseudo instruction is gone now.
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
                                      MachineBasicBlock *BB) const {
  // This is pretty easy.  We're taking the value that we received from
  // our load from the relocation, sticking it in either RDI (x86-64)
  // or EAX and doing an indirect call.  The return value will then
  // be in the normal return register.
  const X86InstrInfo *TII
    = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
  DebugLoc DL = MI->getDebugLoc();
  MachineFunction *F = BB->getParent();

  assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
  assert(MI->getOperand(3).isGlobal() && "This should be a global");

  if (Subtarget->is64Bit()) {
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                      TII->get(X86::MOV64rm), X86::RDI)
      .addReg(X86::RIP)
      .addImm(0).addReg(0)
      .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
                        MI->getOperand(3).getTargetFlags())
      .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
    addDirectMem(MIB, X86::RDI);
  } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                      TII->get(X86::MOV32rm), X86::EAX)
      .addReg(0)
      .addImm(0).addReg(0)
      .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
                        MI->getOperand(3).getTargetFlags())
      .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
  } else {
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                      TII->get(X86::MOV32rm), X86::EAX)
      .addReg(TII->getGlobalBaseReg(F))
      .addImm(0).addReg(0)
      .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
                        MI->getOperand(3).getTargetFlags())
      .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
  }

  MI->eraseFromParent(); // The pseudo instruction is gone now.
11346 return BB; 11347} 11348 11349MachineBasicBlock * 11350X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 11351 MachineBasicBlock *BB) const { 11352 switch (MI->getOpcode()) { 11353 default: assert(false && "Unexpected instr type to insert"); 11354 case X86::TAILJMPd64: 11355 case X86::TAILJMPr64: 11356 case X86::TAILJMPm64: 11357 assert(!"TAILJMP64 would not be touched here."); 11358 case X86::TCRETURNdi64: 11359 case X86::TCRETURNri64: 11360 case X86::TCRETURNmi64: 11361 // Defs of TCRETURNxx64 has Win64's callee-saved registers, as subset. 11362 // On AMD64, additional defs should be added before register allocation. 11363 if (!Subtarget->isTargetWin64()) { 11364 MI->addRegisterDefined(X86::RSI); 11365 MI->addRegisterDefined(X86::RDI); 11366 MI->addRegisterDefined(X86::XMM6); 11367 MI->addRegisterDefined(X86::XMM7); 11368 MI->addRegisterDefined(X86::XMM8); 11369 MI->addRegisterDefined(X86::XMM9); 11370 MI->addRegisterDefined(X86::XMM10); 11371 MI->addRegisterDefined(X86::XMM11); 11372 MI->addRegisterDefined(X86::XMM12); 11373 MI->addRegisterDefined(X86::XMM13); 11374 MI->addRegisterDefined(X86::XMM14); 11375 MI->addRegisterDefined(X86::XMM15); 11376 } 11377 return BB; 11378 case X86::WIN_ALLOCA: 11379 return EmitLoweredWinAlloca(MI, BB); 11380 case X86::TLSCall_32: 11381 case X86::TLSCall_64: 11382 return EmitLoweredTLSCall(MI, BB); 11383 case X86::CMOV_GR8: 11384 case X86::CMOV_FR32: 11385 case X86::CMOV_FR64: 11386 case X86::CMOV_V4F32: 11387 case X86::CMOV_V2F64: 11388 case X86::CMOV_V2I64: 11389 case X86::CMOV_V8F32: 11390 case X86::CMOV_V4F64: 11391 case X86::CMOV_V4I64: 11392 case X86::CMOV_GR16: 11393 case X86::CMOV_GR32: 11394 case X86::CMOV_RFP32: 11395 case X86::CMOV_RFP64: 11396 case X86::CMOV_RFP80: 11397 return EmitLoweredSelect(MI, BB); 11398 11399 case X86::FP32_TO_INT16_IN_MEM: 11400 case X86::FP32_TO_INT32_IN_MEM: 11401 case X86::FP32_TO_INT64_IN_MEM: 11402 case X86::FP64_TO_INT16_IN_MEM: 11403 case X86::FP64_TO_INT32_IN_MEM: 11404 case X86::FP64_TO_INT64_IN_MEM: 11405 case X86::FP80_TO_INT16_IN_MEM: 11406 case X86::FP80_TO_INT32_IN_MEM: 11407 case X86::FP80_TO_INT64_IN_MEM: { 11408 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11409 DebugLoc DL = MI->getDebugLoc(); 11410 11411 // Change the floating point control register to use "round towards zero" 11412 // mode when truncating to an integer value. 11413 MachineFunction *F = BB->getParent(); 11414 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 11415 addFrameReference(BuildMI(*BB, MI, DL, 11416 TII->get(X86::FNSTCW16m)), CWFrameIdx); 11417 11418 // Load the old value of the high byte of the control word... 11419 unsigned OldCW = 11420 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 11421 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 11422 CWFrameIdx); 11423 11424 // Set the high part to be round to zero... 11425 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 11426 .addImm(0xC7F); 11427 11428 // Reload the modified control word now... 11429 addFrameReference(BuildMI(*BB, MI, DL, 11430 TII->get(X86::FLDCW16m)), CWFrameIdx); 11431 11432 // Restore the memory image of control word to original value 11433 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 11434 .addReg(OldCW); 11435 11436 // Get the X86 opcode to use. 
11437 unsigned Opc; 11438 switch (MI->getOpcode()) { 11439 default: llvm_unreachable("illegal opcode!"); 11440 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 11441 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 11442 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 11443 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 11444 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 11445 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 11446 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 11447 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 11448 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 11449 } 11450 11451 X86AddressMode AM; 11452 MachineOperand &Op = MI->getOperand(0); 11453 if (Op.isReg()) { 11454 AM.BaseType = X86AddressMode::RegBase; 11455 AM.Base.Reg = Op.getReg(); 11456 } else { 11457 AM.BaseType = X86AddressMode::FrameIndexBase; 11458 AM.Base.FrameIndex = Op.getIndex(); 11459 } 11460 Op = MI->getOperand(1); 11461 if (Op.isImm()) 11462 AM.Scale = Op.getImm(); 11463 Op = MI->getOperand(2); 11464 if (Op.isImm()) 11465 AM.IndexReg = Op.getImm(); 11466 Op = MI->getOperand(3); 11467 if (Op.isGlobal()) { 11468 AM.GV = Op.getGlobal(); 11469 } else { 11470 AM.Disp = Op.getImm(); 11471 } 11472 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 11473 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 11474 11475 // Reload the original control word now. 11476 addFrameReference(BuildMI(*BB, MI, DL, 11477 TII->get(X86::FLDCW16m)), CWFrameIdx); 11478 11479 MI->eraseFromParent(); // The pseudo instruction is gone now. 11480 return BB; 11481 } 11482 // String/text processing lowering. 11483 case X86::PCMPISTRM128REG: 11484 case X86::VPCMPISTRM128REG: 11485 return EmitPCMP(MI, BB, 3, false /* in-mem */); 11486 case X86::PCMPISTRM128MEM: 11487 case X86::VPCMPISTRM128MEM: 11488 return EmitPCMP(MI, BB, 3, true /* in-mem */); 11489 case X86::PCMPESTRM128REG: 11490 case X86::VPCMPESTRM128REG: 11491 return EmitPCMP(MI, BB, 5, false /* in mem */); 11492 case X86::PCMPESTRM128MEM: 11493 case X86::VPCMPESTRM128MEM: 11494 return EmitPCMP(MI, BB, 5, true /* in mem */); 11495 11496 // Thread synchronization. 11497 case X86::MONITOR: 11498 return EmitMonitor(MI, BB); 11499 case X86::MWAIT: 11500 return EmitMwait(MI, BB); 11501 11502 // Atomic Lowering. 
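  // Each ATOM* pseudo below expands into a compare-and-swap retry loop,
  // which needs new basic blocks and so cannot be expressed as a plain
  // pattern in the .td files.  For the bitwise forms the arguments are, in
  // order: the register and immediate opcodes for the operation, the load
  // and LCMPXCHG opcodes for the access size, the NOT opcode and accumulator
  // register, the register class, and (NAND only) a flag selecting the
  // inverted variant.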
11503 case X86::ATOMAND32: 11504 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 11505 X86::AND32ri, X86::MOV32rm, 11506 X86::LCMPXCHG32, 11507 X86::NOT32r, X86::EAX, 11508 X86::GR32RegisterClass); 11509 case X86::ATOMOR32: 11510 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 11511 X86::OR32ri, X86::MOV32rm, 11512 X86::LCMPXCHG32, 11513 X86::NOT32r, X86::EAX, 11514 X86::GR32RegisterClass); 11515 case X86::ATOMXOR32: 11516 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 11517 X86::XOR32ri, X86::MOV32rm, 11518 X86::LCMPXCHG32, 11519 X86::NOT32r, X86::EAX, 11520 X86::GR32RegisterClass); 11521 case X86::ATOMNAND32: 11522 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 11523 X86::AND32ri, X86::MOV32rm, 11524 X86::LCMPXCHG32, 11525 X86::NOT32r, X86::EAX, 11526 X86::GR32RegisterClass, true); 11527 case X86::ATOMMIN32: 11528 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 11529 case X86::ATOMMAX32: 11530 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 11531 case X86::ATOMUMIN32: 11532 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 11533 case X86::ATOMUMAX32: 11534 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 11535 11536 case X86::ATOMAND16: 11537 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 11538 X86::AND16ri, X86::MOV16rm, 11539 X86::LCMPXCHG16, 11540 X86::NOT16r, X86::AX, 11541 X86::GR16RegisterClass); 11542 case X86::ATOMOR16: 11543 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 11544 X86::OR16ri, X86::MOV16rm, 11545 X86::LCMPXCHG16, 11546 X86::NOT16r, X86::AX, 11547 X86::GR16RegisterClass); 11548 case X86::ATOMXOR16: 11549 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 11550 X86::XOR16ri, X86::MOV16rm, 11551 X86::LCMPXCHG16, 11552 X86::NOT16r, X86::AX, 11553 X86::GR16RegisterClass); 11554 case X86::ATOMNAND16: 11555 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 11556 X86::AND16ri, X86::MOV16rm, 11557 X86::LCMPXCHG16, 11558 X86::NOT16r, X86::AX, 11559 X86::GR16RegisterClass, true); 11560 case X86::ATOMMIN16: 11561 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 11562 case X86::ATOMMAX16: 11563 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 11564 case X86::ATOMUMIN16: 11565 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 11566 case X86::ATOMUMAX16: 11567 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 11568 11569 case X86::ATOMAND8: 11570 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 11571 X86::AND8ri, X86::MOV8rm, 11572 X86::LCMPXCHG8, 11573 X86::NOT8r, X86::AL, 11574 X86::GR8RegisterClass); 11575 case X86::ATOMOR8: 11576 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 11577 X86::OR8ri, X86::MOV8rm, 11578 X86::LCMPXCHG8, 11579 X86::NOT8r, X86::AL, 11580 X86::GR8RegisterClass); 11581 case X86::ATOMXOR8: 11582 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 11583 X86::XOR8ri, X86::MOV8rm, 11584 X86::LCMPXCHG8, 11585 X86::NOT8r, X86::AL, 11586 X86::GR8RegisterClass); 11587 case X86::ATOMNAND8: 11588 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 11589 X86::AND8ri, X86::MOV8rm, 11590 X86::LCMPXCHG8, 11591 X86::NOT8r, X86::AL, 11592 X86::GR8RegisterClass, true); 11593 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 11594 // This group is for 64-bit host. 
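  // That is, these GR64 pseudos are only selected when the target is 64-bit
  // and i64 is legal; 32-bit targets handle 64-bit atomics with the
  // cmpxchg8b-based 6432 group further down.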
11595 case X86::ATOMAND64: 11596 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 11597 X86::AND64ri32, X86::MOV64rm, 11598 X86::LCMPXCHG64, 11599 X86::NOT64r, X86::RAX, 11600 X86::GR64RegisterClass); 11601 case X86::ATOMOR64: 11602 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 11603 X86::OR64ri32, X86::MOV64rm, 11604 X86::LCMPXCHG64, 11605 X86::NOT64r, X86::RAX, 11606 X86::GR64RegisterClass); 11607 case X86::ATOMXOR64: 11608 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 11609 X86::XOR64ri32, X86::MOV64rm, 11610 X86::LCMPXCHG64, 11611 X86::NOT64r, X86::RAX, 11612 X86::GR64RegisterClass); 11613 case X86::ATOMNAND64: 11614 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 11615 X86::AND64ri32, X86::MOV64rm, 11616 X86::LCMPXCHG64, 11617 X86::NOT64r, X86::RAX, 11618 X86::GR64RegisterClass, true); 11619 case X86::ATOMMIN64: 11620 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 11621 case X86::ATOMMAX64: 11622 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 11623 case X86::ATOMUMIN64: 11624 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 11625 case X86::ATOMUMAX64: 11626 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 11627 11628 // This group does 64-bit operations on a 32-bit host. 11629 case X86::ATOMAND6432: 11630 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11631 X86::AND32rr, X86::AND32rr, 11632 X86::AND32ri, X86::AND32ri, 11633 false); 11634 case X86::ATOMOR6432: 11635 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11636 X86::OR32rr, X86::OR32rr, 11637 X86::OR32ri, X86::OR32ri, 11638 false); 11639 case X86::ATOMXOR6432: 11640 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11641 X86::XOR32rr, X86::XOR32rr, 11642 X86::XOR32ri, X86::XOR32ri, 11643 false); 11644 case X86::ATOMNAND6432: 11645 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11646 X86::AND32rr, X86::AND32rr, 11647 X86::AND32ri, X86::AND32ri, 11648 true); 11649 case X86::ATOMADD6432: 11650 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11651 X86::ADD32rr, X86::ADC32rr, 11652 X86::ADD32ri, X86::ADC32ri, 11653 false); 11654 case X86::ATOMSUB6432: 11655 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11656 X86::SUB32rr, X86::SBB32rr, 11657 X86::SUB32ri, X86::SBB32ri, 11658 false); 11659 case X86::ATOMSWAP6432: 11660 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11661 X86::MOV32rr, X86::MOV32rr, 11662 X86::MOV32ri, X86::MOV32ri, 11663 false); 11664 case X86::VASTART_SAVE_XMM_REGS: 11665 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 11666 11667 case X86::VAARG_64: 11668 return EmitVAARG64WithCustomInserter(MI, BB); 11669 } 11670} 11671 11672//===----------------------------------------------------------------------===// 11673// X86 Optimization Hooks 11674//===----------------------------------------------------------------------===// 11675 11676void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 11677 const APInt &Mask, 11678 APInt &KnownZero, 11679 APInt &KnownOne, 11680 const SelectionDAG &DAG, 11681 unsigned Depth) const { 11682 unsigned Opc = Op.getOpcode(); 11683 assert((Opc >= ISD::BUILTIN_OP_END || 11684 Opc == ISD::INTRINSIC_WO_CHAIN || 11685 Opc == ISD::INTRINSIC_W_CHAIN || 11686 Opc == ISD::INTRINSIC_VOID) && 11687 "Should use MaskedValueIsZero if you don't know whether Op" 11688 " is a target node!"); 11689 11690 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
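  // For the flag-producing nodes handled below, the boolean result can only
  // be 0 or 1, so every bit above bit 0 is reported as known zero.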
11691 switch (Opc) { 11692 default: break; 11693 case X86ISD::ADD: 11694 case X86ISD::SUB: 11695 case X86ISD::ADC: 11696 case X86ISD::SBB: 11697 case X86ISD::SMUL: 11698 case X86ISD::UMUL: 11699 case X86ISD::INC: 11700 case X86ISD::DEC: 11701 case X86ISD::OR: 11702 case X86ISD::XOR: 11703 case X86ISD::AND: 11704 // These nodes' second result is a boolean. 11705 if (Op.getResNo() == 0) 11706 break; 11707 // Fallthrough 11708 case X86ISD::SETCC: 11709 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 11710 Mask.getBitWidth() - 1); 11711 break; 11712 } 11713} 11714 11715unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 11716 unsigned Depth) const { 11717 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 11718 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 11719 return Op.getValueType().getScalarType().getSizeInBits(); 11720 11721 // Fallback case. 11722 return 1; 11723} 11724 11725/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 11726/// node is a GlobalAddress + offset. 11727bool X86TargetLowering::isGAPlusOffset(SDNode *N, 11728 const GlobalValue* &GA, 11729 int64_t &Offset) const { 11730 if (N->getOpcode() == X86ISD::Wrapper) { 11731 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 11732 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 11733 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 11734 return true; 11735 } 11736 } 11737 return TargetLowering::isGAPlusOffset(N, GA, Offset); 11738} 11739 11740/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the 11741/// same as extracting the high 128-bit part of 256-bit vector and then 11742/// inserting the result into the low part of a new 256-bit vector 11743static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { 11744 EVT VT = SVOp->getValueType(0); 11745 int NumElems = VT.getVectorNumElements(); 11746 11747 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 11748 for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j) 11749 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 11750 SVOp->getMaskElt(j) >= 0) 11751 return false; 11752 11753 return true; 11754} 11755 11756/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the 11757/// same as extracting the low 128-bit part of 256-bit vector and then 11758/// inserting the result into the high part of a new 256-bit vector 11759static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { 11760 EVT VT = SVOp->getValueType(0); 11761 int NumElems = VT.getVectorNumElements(); 11762 11763 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 11764 for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j) 11765 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 11766 SVOp->getMaskElt(j) >= 0) 11767 return false; 11768 11769 return true; 11770} 11771 11772/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. 11773static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, 11774 TargetLowering::DAGCombinerInfo &DCI) { 11775 DebugLoc dl = N->getDebugLoc(); 11776 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 11777 SDValue V1 = SVOp->getOperand(0); 11778 SDValue V2 = SVOp->getOperand(1); 11779 EVT VT = SVOp->getValueType(0); 11780 int NumElems = VT.getVectorNumElements(); 11781 11782 if (V1.getOpcode() == ISD::CONCAT_VECTORS && 11783 V2.getOpcode() == ISD::CONCAT_VECTORS) { 11784 // 11785 // 0,0,0,... 
11786 // | 11787 // V UNDEF BUILD_VECTOR UNDEF 11788 // \ / \ / 11789 // CONCAT_VECTOR CONCAT_VECTOR 11790 // \ / 11791 // \ / 11792 // RESULT: V + zero extended 11793 // 11794 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || 11795 V2.getOperand(1).getOpcode() != ISD::UNDEF || 11796 V1.getOperand(1).getOpcode() != ISD::UNDEF) 11797 return SDValue(); 11798 11799 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) 11800 return SDValue(); 11801 11802 // To match the shuffle mask, the first half of the mask should 11803 // be exactly the first vector, and all the rest a splat with the 11804 // first element of the second one. 11805 for (int i = 0; i < NumElems/2; ++i) 11806 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || 11807 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) 11808 return SDValue(); 11809 11810 // Emit a zeroed vector and insert the desired subvector on its 11811 // first half. 11812 SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */, DAG, dl); 11813 SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 11814 DAG.getConstant(0, MVT::i32), DAG, dl); 11815 return DCI.CombineTo(N, InsV); 11816 } 11817 11818 //===--------------------------------------------------------------------===// 11819 // Combine some shuffles into subvector extracts and inserts: 11820 // 11821 11822 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 11823 if (isShuffleHigh128VectorInsertLow(SVOp)) { 11824 SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32), 11825 DAG, dl); 11826 SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), 11827 V, DAG.getConstant(0, MVT::i32), DAG, dl); 11828 return DCI.CombineTo(N, InsV); 11829 } 11830 11831 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 11832 if (isShuffleLow128VectorInsertHigh(SVOp)) { 11833 SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl); 11834 SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), 11835 V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); 11836 return DCI.CombineTo(N, InsV); 11837 } 11838 11839 return SDValue(); 11840} 11841 11842/// PerformShuffleCombine - Performs several different shuffle combines. 11843static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 11844 TargetLowering::DAGCombinerInfo &DCI, 11845 const X86Subtarget *Subtarget) { 11846 DebugLoc dl = N->getDebugLoc(); 11847 EVT VT = N->getValueType(0); 11848 11849 // Don't create instructions with illegal types after legalize types has run. 11850 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11851 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 11852 return SDValue(); 11853 11854 // Combine 256-bit vector shuffles. This is only profitable when in AVX mode 11855 if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 && 11856 N->getOpcode() == ISD::VECTOR_SHUFFLE) 11857 return PerformShuffleCombine256(N, DAG, DCI); 11858 11859 // Only handle 128 wide vector from here on. 11860 if (VT.getSizeInBits() != 128) 11861 return SDValue(); 11862 11863 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, 11864 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are 11865 // consecutive, non-overlapping, and in the right order. 
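  // Illustrative example:
  //   t = build_vector (load a+0), (load a+4), (load a+8), (load a+12)
  //   vector_shuffle t, undef, <0, 1, 2, 3>
  // becomes a single 16-byte load of 'a'.  EltsFromConsecutiveLoads checks
  // the addresses and builds the wide load when the conditions above hold.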
  SmallVector<SDValue, 16> Elts;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));

  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
}

/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
/// generation and convert it from being a bunch of shuffles and extracts
/// to a simple store and scalar loads to extract the elements.
static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                                                const TargetLowering &TLI) {
  SDValue InputVector = N->getOperand(0);

  // Only operate on vectors of 4 elements, where the alternative shuffling
  // gets to be more expensive.
  if (InputVector.getValueType() != MVT::v4i32)
    return SDValue();

  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
  // single use which is a sign-extend or zero-extend, and all elements are
  // used.
  SmallVector<SDNode *, 4> Uses;
  unsigned ExtractedElements = 0;
  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
    if (UI.getUse().getResNo() != InputVector.getResNo())
      return SDValue();

    SDNode *Extract = *UI;
    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    if (Extract->getValueType(0) != MVT::i32)
      return SDValue();
    if (!Extract->hasOneUse())
      return SDValue();
    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
      return SDValue();
    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
      return SDValue();

    // Record which element was extracted.
    ExtractedElements |=
      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();

    Uses.push_back(Extract);
  }

  // If not all the elements were used, this may not be worthwhile.
  if (ExtractedElements != 15)
    return SDValue();

  // Ok, we've now decided to do the transformation.
  DebugLoc dl = InputVector.getDebugLoc();

  // Store the value to a temporary stack slot.
  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
                            MachinePointerInfo(), false, false, 0);

  // Replace each use (extract) with a load of the appropriate element.
  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
       UE = Uses.end(); UI != UE; ++UI) {
    SDNode *Extract = *UI;

    // Compute the element's address.
    SDValue Idx = Extract->getOperand(1);
    unsigned EltSize =
        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());

    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
                                     StackPtr, OffsetVal);

    // Load the scalar.
    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
                                     ScalarAddr, MachinePointerInfo(),
                                     false, false, 0);

    // Replace the extract with the load.
    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
  }

  // The replacement was made in place; don't return anything.
11953 return SDValue(); 11954} 11955 11956/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 11957static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 11958 const X86Subtarget *Subtarget) { 11959 DebugLoc DL = N->getDebugLoc(); 11960 SDValue Cond = N->getOperand(0); 11961 // Get the LHS/RHS of the select. 11962 SDValue LHS = N->getOperand(1); 11963 SDValue RHS = N->getOperand(2); 11964 11965 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 11966 // instructions match the semantics of the common C idiom x<y?x:y but not 11967 // x<=y?x:y, because of how they handle negative zero (which can be 11968 // ignored in unsafe-math mode). 11969 if (Subtarget->hasSSE2() && 11970 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 11971 Cond.getOpcode() == ISD::SETCC) { 11972 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 11973 11974 unsigned Opcode = 0; 11975 // Check for x CC y ? x : y. 11976 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 11977 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 11978 switch (CC) { 11979 default: break; 11980 case ISD::SETULT: 11981 // Converting this to a min would handle NaNs incorrectly, and swapping 11982 // the operands would cause it to handle comparisons between positive 11983 // and negative zero incorrectly. 11984 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 11985 if (!UnsafeFPMath && 11986 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 11987 break; 11988 std::swap(LHS, RHS); 11989 } 11990 Opcode = X86ISD::FMIN; 11991 break; 11992 case ISD::SETOLE: 11993 // Converting this to a min would handle comparisons between positive 11994 // and negative zero incorrectly. 11995 if (!UnsafeFPMath && 11996 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 11997 break; 11998 Opcode = X86ISD::FMIN; 11999 break; 12000 case ISD::SETULE: 12001 // Converting this to a min would handle both negative zeros and NaNs 12002 // incorrectly, but we can swap the operands to fix both. 12003 std::swap(LHS, RHS); 12004 case ISD::SETOLT: 12005 case ISD::SETLT: 12006 case ISD::SETLE: 12007 Opcode = X86ISD::FMIN; 12008 break; 12009 12010 case ISD::SETOGE: 12011 // Converting this to a max would handle comparisons between positive 12012 // and negative zero incorrectly. 12013 if (!UnsafeFPMath && 12014 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 12015 break; 12016 Opcode = X86ISD::FMAX; 12017 break; 12018 case ISD::SETUGT: 12019 // Converting this to a max would handle NaNs incorrectly, and swapping 12020 // the operands would cause it to handle comparisons between positive 12021 // and negative zero incorrectly. 12022 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 12023 if (!UnsafeFPMath && 12024 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 12025 break; 12026 std::swap(LHS, RHS); 12027 } 12028 Opcode = X86ISD::FMAX; 12029 break; 12030 case ISD::SETUGE: 12031 // Converting this to a max would handle both negative zeros and NaNs 12032 // incorrectly, but we can swap the operands to fix both. 12033 std::swap(LHS, RHS); 12034 case ISD::SETOGT: 12035 case ISD::SETGT: 12036 case ISD::SETGE: 12037 Opcode = X86ISD::FMAX; 12038 break; 12039 } 12040 // Check for x CC y ? y : x -- a min/max with reversed arms. 
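    // (For example "x < y ? y : x" selects the larger value, so the cases
    // below mirror the block above with FMIN and FMAX exchanged.)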
12041 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 12042 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 12043 switch (CC) { 12044 default: break; 12045 case ISD::SETOGE: 12046 // Converting this to a min would handle comparisons between positive 12047 // and negative zero incorrectly, and swapping the operands would 12048 // cause it to handle NaNs incorrectly. 12049 if (!UnsafeFPMath && 12050 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 12051 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 12052 break; 12053 std::swap(LHS, RHS); 12054 } 12055 Opcode = X86ISD::FMIN; 12056 break; 12057 case ISD::SETUGT: 12058 // Converting this to a min would handle NaNs incorrectly. 12059 if (!UnsafeFPMath && 12060 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 12061 break; 12062 Opcode = X86ISD::FMIN; 12063 break; 12064 case ISD::SETUGE: 12065 // Converting this to a min would handle both negative zeros and NaNs 12066 // incorrectly, but we can swap the operands to fix both. 12067 std::swap(LHS, RHS); 12068 case ISD::SETOGT: 12069 case ISD::SETGT: 12070 case ISD::SETGE: 12071 Opcode = X86ISD::FMIN; 12072 break; 12073 12074 case ISD::SETULT: 12075 // Converting this to a max would handle NaNs incorrectly. 12076 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 12077 break; 12078 Opcode = X86ISD::FMAX; 12079 break; 12080 case ISD::SETOLE: 12081 // Converting this to a max would handle comparisons between positive 12082 // and negative zero incorrectly, and swapping the operands would 12083 // cause it to handle NaNs incorrectly. 12084 if (!UnsafeFPMath && 12085 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 12086 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 12087 break; 12088 std::swap(LHS, RHS); 12089 } 12090 Opcode = X86ISD::FMAX; 12091 break; 12092 case ISD::SETULE: 12093 // Converting this to a max would handle both negative zeros and NaNs 12094 // incorrectly, but we can swap the operands to fix both. 12095 std::swap(LHS, RHS); 12096 case ISD::SETOLT: 12097 case ISD::SETLT: 12098 case ISD::SETLE: 12099 Opcode = X86ISD::FMAX; 12100 break; 12101 } 12102 } 12103 12104 if (Opcode) 12105 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 12106 } 12107 12108 // If this is a select between two integer constants, try to do some 12109 // optimizations. 12110 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 12111 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 12112 // Don't do this for crazy integer types. 12113 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 12114 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 12115 // so that TrueC (the true value) is larger than FalseC. 12116 bool NeedsCondInvert = false; 12117 12118 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 12119 // Efficiently invertible. 12120 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 12121 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 12122 isa<ConstantSDNode>(Cond.getOperand(1))))) { 12123 NeedsCondInvert = true; 12124 std::swap(TrueC, FalseC); 12125 } 12126 12127 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 12128 if (FalseC->getAPIntValue() == 0 && 12129 TrueC->getAPIntValue().isPowerOf2()) { 12130 if (NeedsCondInvert) // Invert the condition if needed. 12131 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 12132 DAG.getConstant(1, Cond.getValueType())); 12133 12134 // Zero extend the condition if needed. 
12135 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 12136 12137 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 12138 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 12139 DAG.getConstant(ShAmt, MVT::i8)); 12140 } 12141 12142 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 12143 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 12144 if (NeedsCondInvert) // Invert the condition if needed. 12145 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 12146 DAG.getConstant(1, Cond.getValueType())); 12147 12148 // Zero extend the condition if needed. 12149 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 12150 FalseC->getValueType(0), Cond); 12151 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 12152 SDValue(FalseC, 0)); 12153 } 12154 12155 // Optimize cases that will turn into an LEA instruction. This requires 12156 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 12157 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 12158 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 12159 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 12160 12161 bool isFastMultiplier = false; 12162 if (Diff < 10) { 12163 switch ((unsigned char)Diff) { 12164 default: break; 12165 case 1: // result = add base, cond 12166 case 2: // result = lea base( , cond*2) 12167 case 3: // result = lea base(cond, cond*2) 12168 case 4: // result = lea base( , cond*4) 12169 case 5: // result = lea base(cond, cond*4) 12170 case 8: // result = lea base( , cond*8) 12171 case 9: // result = lea base(cond, cond*8) 12172 isFastMultiplier = true; 12173 break; 12174 } 12175 } 12176 12177 if (isFastMultiplier) { 12178 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 12179 if (NeedsCondInvert) // Invert the condition if needed. 12180 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 12181 DAG.getConstant(1, Cond.getValueType())); 12182 12183 // Zero extend the condition if needed. 12184 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 12185 Cond); 12186 // Scale the condition by the difference. 12187 if (Diff != 1) 12188 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 12189 DAG.getConstant(Diff, Cond.getValueType())); 12190 12191 // Add the base if non-zero. 12192 if (FalseC->getAPIntValue() != 0) 12193 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 12194 SDValue(FalseC, 0)); 12195 return Cond; 12196 } 12197 } 12198 } 12199 } 12200 12201 return SDValue(); 12202} 12203 12204/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 12205static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 12206 TargetLowering::DAGCombinerInfo &DCI) { 12207 DebugLoc DL = N->getDebugLoc(); 12208 12209 // If the flag operand isn't dead, don't touch this CMOV. 12210 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 12211 return SDValue(); 12212 12213 SDValue FalseOp = N->getOperand(0); 12214 SDValue TrueOp = N->getOperand(1); 12215 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 12216 SDValue Cond = N->getOperand(3); 12217 if (CC == X86::COND_E || CC == X86::COND_NE) { 12218 switch (Cond.getOpcode()) { 12219 default: break; 12220 case X86ISD::BSR: 12221 case X86ISD::BSF: 12222 // If operand of BSR / BSF are proven never zero, then ZF cannot be set. 12223 if (DAG.isKnownNeverZero(Cond.getOperand(0))) 12224 return (CC == X86::COND_E) ? 
FalseOp : TrueOp; 12225 } 12226 } 12227 12228 // If this is a select between two integer constants, try to do some 12229 // optimizations. Note that the operands are ordered the opposite of SELECT 12230 // operands. 12231 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { 12232 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { 12233 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 12234 // larger than FalseC (the false value). 12235 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 12236 CC = X86::GetOppositeBranchCondition(CC); 12237 std::swap(TrueC, FalseC); 12238 } 12239 12240 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 12241 // This is efficient for any integer data type (including i8/i16) and 12242 // shift amount. 12243 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 12244 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 12245 DAG.getConstant(CC, MVT::i8), Cond); 12246 12247 // Zero extend the condition if needed. 12248 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 12249 12250 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 12251 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 12252 DAG.getConstant(ShAmt, MVT::i8)); 12253 if (N->getNumValues() == 2) // Dead flag value? 12254 return DCI.CombineTo(N, Cond, SDValue()); 12255 return Cond; 12256 } 12257 12258 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 12259 // for any integer data type, including i8/i16. 12260 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 12261 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 12262 DAG.getConstant(CC, MVT::i8), Cond); 12263 12264 // Zero extend the condition if needed. 12265 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 12266 FalseC->getValueType(0), Cond); 12267 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 12268 SDValue(FalseC, 0)); 12269 12270 if (N->getNumValues() == 2) // Dead flag value? 12271 return DCI.CombineTo(N, Cond, SDValue()); 12272 return Cond; 12273 } 12274 12275 // Optimize cases that will turn into an LEA instruction. This requires 12276 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 12277 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 12278 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 12279 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 12280 12281 bool isFastMultiplier = false; 12282 if (Diff < 10) { 12283 switch ((unsigned char)Diff) { 12284 default: break; 12285 case 1: // result = add base, cond 12286 case 2: // result = lea base( , cond*2) 12287 case 3: // result = lea base(cond, cond*2) 12288 case 4: // result = lea base( , cond*4) 12289 case 5: // result = lea base(cond, cond*4) 12290 case 8: // result = lea base( , cond*8) 12291 case 9: // result = lea base(cond, cond*8) 12292 isFastMultiplier = true; 12293 break; 12294 } 12295 } 12296 12297 if (isFastMultiplier) { 12298 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 12299 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 12300 DAG.getConstant(CC, MVT::i8), Cond); 12301 // Zero extend the condition if needed. 12302 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 12303 Cond); 12304 // Scale the condition by the difference. 12305 if (Diff != 1) 12306 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 12307 DAG.getConstant(Diff, Cond.getValueType())); 12308 12309 // Add the base if non-zero. 
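        // (Illustrative: with TrueC == 14 and FalseC == 5, Diff is 9, an
        //  LEA-friendly multiplier, so the zero-extended condition is scaled
        //  by 9 above and the base value 5 is added by the node built below.)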
12310 if (FalseC->getAPIntValue() != 0) 12311 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 12312 SDValue(FalseC, 0)); 12313 if (N->getNumValues() == 2) // Dead flag value? 12314 return DCI.CombineTo(N, Cond, SDValue()); 12315 return Cond; 12316 } 12317 } 12318 } 12319 } 12320 return SDValue(); 12321} 12322 12323 12324/// PerformMulCombine - Optimize a single multiply with constant into two 12325/// in order to implement it with two cheaper instructions, e.g. 12326/// LEA + SHL, LEA + LEA. 12327static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 12328 TargetLowering::DAGCombinerInfo &DCI) { 12329 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 12330 return SDValue(); 12331 12332 EVT VT = N->getValueType(0); 12333 if (VT != MVT::i64) 12334 return SDValue(); 12335 12336 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 12337 if (!C) 12338 return SDValue(); 12339 uint64_t MulAmt = C->getZExtValue(); 12340 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 12341 return SDValue(); 12342 12343 uint64_t MulAmt1 = 0; 12344 uint64_t MulAmt2 = 0; 12345 if ((MulAmt % 9) == 0) { 12346 MulAmt1 = 9; 12347 MulAmt2 = MulAmt / 9; 12348 } else if ((MulAmt % 5) == 0) { 12349 MulAmt1 = 5; 12350 MulAmt2 = MulAmt / 5; 12351 } else if ((MulAmt % 3) == 0) { 12352 MulAmt1 = 3; 12353 MulAmt2 = MulAmt / 3; 12354 } 12355 if (MulAmt2 && 12356 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 12357 DebugLoc DL = N->getDebugLoc(); 12358 12359 if (isPowerOf2_64(MulAmt2) && 12360 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 12361 // If second multiplifer is pow2, issue it first. We want the multiply by 12362 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 12363 // is an add. 12364 std::swap(MulAmt1, MulAmt2); 12365 12366 SDValue NewMul; 12367 if (isPowerOf2_64(MulAmt1)) 12368 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 12369 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 12370 else 12371 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 12372 DAG.getConstant(MulAmt1, VT)); 12373 12374 if (isPowerOf2_64(MulAmt2)) 12375 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 12376 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 12377 else 12378 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 12379 DAG.getConstant(MulAmt2, VT)); 12380 12381 // Do not add new nodes to DAG combiner worklist. 12382 DCI.CombineTo(N, NewMul, false); 12383 } 12384 return SDValue(); 12385} 12386 12387static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 12388 SDValue N0 = N->getOperand(0); 12389 SDValue N1 = N->getOperand(1); 12390 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 12391 EVT VT = N0.getValueType(); 12392 12393 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 12394 // since the result of setcc_c is all zero's or all ones. 
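  // For example, (shl (and (setcc_carry), 1), 3) becomes
  // (and (setcc_carry), 8), provided the shifted mask is still non-zero.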
12395 if (N1C && N0.getOpcode() == ISD::AND && 12396 N0.getOperand(1).getOpcode() == ISD::Constant) { 12397 SDValue N00 = N0.getOperand(0); 12398 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 12399 ((N00.getOpcode() == ISD::ANY_EXTEND || 12400 N00.getOpcode() == ISD::ZERO_EXTEND) && 12401 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 12402 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 12403 APInt ShAmt = N1C->getAPIntValue(); 12404 Mask = Mask.shl(ShAmt); 12405 if (Mask != 0) 12406 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, 12407 N00, DAG.getConstant(Mask, VT)); 12408 } 12409 } 12410 12411 return SDValue(); 12412} 12413 12414/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 12415/// when possible. 12416static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 12417 const X86Subtarget *Subtarget) { 12418 EVT VT = N->getValueType(0); 12419 if (!VT.isVector() && VT.isInteger() && 12420 N->getOpcode() == ISD::SHL) 12421 return PerformSHLCombine(N, DAG); 12422 12423 // On X86 with SSE2 support, we can transform this to a vector shift if 12424 // all elements are shifted by the same amount. We can't do this in legalize 12425 // because the a constant vector is typically transformed to a constant pool 12426 // so we have no knowledge of the shift amount. 12427 if (!(Subtarget->hasSSE2() || Subtarget->hasAVX())) 12428 return SDValue(); 12429 12430 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 12431 return SDValue(); 12432 12433 SDValue ShAmtOp = N->getOperand(1); 12434 EVT EltVT = VT.getVectorElementType(); 12435 DebugLoc DL = N->getDebugLoc(); 12436 SDValue BaseShAmt = SDValue(); 12437 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 12438 unsigned NumElts = VT.getVectorNumElements(); 12439 unsigned i = 0; 12440 for (; i != NumElts; ++i) { 12441 SDValue Arg = ShAmtOp.getOperand(i); 12442 if (Arg.getOpcode() == ISD::UNDEF) continue; 12443 BaseShAmt = Arg; 12444 break; 12445 } 12446 for (; i != NumElts; ++i) { 12447 SDValue Arg = ShAmtOp.getOperand(i); 12448 if (Arg.getOpcode() == ISD::UNDEF) continue; 12449 if (Arg != BaseShAmt) { 12450 return SDValue(); 12451 } 12452 } 12453 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 12454 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 12455 SDValue InVec = ShAmtOp.getOperand(0); 12456 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 12457 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 12458 unsigned i = 0; 12459 for (; i != NumElts; ++i) { 12460 SDValue Arg = InVec.getOperand(i); 12461 if (Arg.getOpcode() == ISD::UNDEF) continue; 12462 BaseShAmt = Arg; 12463 break; 12464 } 12465 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 12466 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 12467 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 12468 if (C->getZExtValue() == SplatIdx) 12469 BaseShAmt = InVec.getOperand(1); 12470 } 12471 } 12472 if (BaseShAmt.getNode() == 0) 12473 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 12474 DAG.getIntPtrConstant(0)); 12475 } else 12476 return SDValue(); 12477 12478 // The shift amount is an i32. 12479 if (EltVT.bitsGT(MVT::i32)) 12480 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 12481 else if (EltVT.bitsLT(MVT::i32)) 12482 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 12483 12484 // The shift amount is identical so we can do a vector shift. 
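  // (Illustrative: (shl v4i32 X, <5,5,5,5>) reaches this point with
  //  BaseShAmt == 5 and is emitted below as the x86_sse2_pslli_d intrinsic.)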
12485 SDValue ValOp = N->getOperand(0); 12486 switch (N->getOpcode()) { 12487 default: 12488 llvm_unreachable("Unknown shift opcode!"); 12489 break; 12490 case ISD::SHL: 12491 if (VT == MVT::v2i64) 12492 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12493 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 12494 ValOp, BaseShAmt); 12495 if (VT == MVT::v4i32) 12496 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12497 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 12498 ValOp, BaseShAmt); 12499 if (VT == MVT::v8i16) 12500 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12501 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 12502 ValOp, BaseShAmt); 12503 break; 12504 case ISD::SRA: 12505 if (VT == MVT::v4i32) 12506 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12507 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 12508 ValOp, BaseShAmt); 12509 if (VT == MVT::v8i16) 12510 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12511 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 12512 ValOp, BaseShAmt); 12513 break; 12514 case ISD::SRL: 12515 if (VT == MVT::v2i64) 12516 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12517 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 12518 ValOp, BaseShAmt); 12519 if (VT == MVT::v4i32) 12520 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12521 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 12522 ValOp, BaseShAmt); 12523 if (VT == MVT::v8i16) 12524 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12525 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 12526 ValOp, BaseShAmt); 12527 break; 12528 } 12529 return SDValue(); 12530} 12531 12532 12533// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) 12534// where both setccs reference the same FP CMP, and rewrite for CMPEQSS 12535// and friends. Likewise for OR -> CMPNEQSS. 12536static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, 12537 TargetLowering::DAGCombinerInfo &DCI, 12538 const X86Subtarget *Subtarget) { 12539 unsigned opcode; 12540 12541 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but 12542 // we're requiring SSE2 for both. 12543 if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { 12544 SDValue N0 = N->getOperand(0); 12545 SDValue N1 = N->getOperand(1); 12546 SDValue CMP0 = N0->getOperand(1); 12547 SDValue CMP1 = N1->getOperand(1); 12548 DebugLoc DL = N->getDebugLoc(); 12549 12550 // The SETCCs should both refer to the same CMP. 
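    // That is, the pattern being matched here is roughly
    //   (and/or (setcc cc0, (cmp A, B)), (setcc cc1, (cmp A, B)))
    // with scalar f32/f64 operands; the cc0/cc1 pair is checked below
    // (E with NP, or NE with P) before rewriting to a CMPEQSS/CMPNEQSS
    // style compare.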
12551 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) 12552 return SDValue(); 12553 12554 SDValue CMP00 = CMP0->getOperand(0); 12555 SDValue CMP01 = CMP0->getOperand(1); 12556 EVT VT = CMP00.getValueType(); 12557 12558 if (VT == MVT::f32 || VT == MVT::f64) { 12559 bool ExpectingFlags = false; 12560 // Check for any users that want flags: 12561 for (SDNode::use_iterator UI = N->use_begin(), 12562 UE = N->use_end(); 12563 !ExpectingFlags && UI != UE; ++UI) 12564 switch (UI->getOpcode()) { 12565 default: 12566 case ISD::BR_CC: 12567 case ISD::BRCOND: 12568 case ISD::SELECT: 12569 ExpectingFlags = true; 12570 break; 12571 case ISD::CopyToReg: 12572 case ISD::SIGN_EXTEND: 12573 case ISD::ZERO_EXTEND: 12574 case ISD::ANY_EXTEND: 12575 break; 12576 } 12577 12578 if (!ExpectingFlags) { 12579 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); 12580 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); 12581 12582 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { 12583 X86::CondCode tmp = cc0; 12584 cc0 = cc1; 12585 cc1 = tmp; 12586 } 12587 12588 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || 12589 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { 12590 bool is64BitFP = (CMP00.getValueType() == MVT::f64); 12591 X86ISD::NodeType NTOperator = is64BitFP ? 12592 X86ISD::FSETCCsd : X86ISD::FSETCCss; 12593 // FIXME: need symbolic constants for these magic numbers. 12594 // See X86ATTInstPrinter.cpp:printSSECC(). 12595 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; 12596 SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01, 12597 DAG.getConstant(x86cc, MVT::i8)); 12598 SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32, 12599 OnesOrZeroesF); 12600 SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI, 12601 DAG.getConstant(1, MVT::i32)); 12602 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); 12603 return OneBitOfTruth; 12604 } 12605 } 12606 } 12607 } 12608 return SDValue(); 12609} 12610 12611/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector 12612/// so it can be folded inside ANDNP. 12613static bool CanFoldXORWithAllOnes(const SDNode *N) { 12614 EVT VT = N->getValueType(0); 12615 12616 // Match direct AllOnes for 128 and 256-bit vectors 12617 if (ISD::isBuildVectorAllOnes(N)) 12618 return true; 12619 12620 // Look through a bit convert. 12621 if (N->getOpcode() == ISD::BITCAST) 12622 N = N->getOperand(0).getNode(); 12623 12624 // Sometimes the operand may come from a insert_subvector building a 256-bit 12625 // allones vector 12626 if (VT.getSizeInBits() == 256 && 12627 N->getOpcode() == ISD::INSERT_SUBVECTOR) { 12628 SDValue V1 = N->getOperand(0); 12629 SDValue V2 = N->getOperand(1); 12630 12631 if (V1.getOpcode() == ISD::INSERT_SUBVECTOR && 12632 V1.getOperand(0).getOpcode() == ISD::UNDEF && 12633 ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && 12634 ISD::isBuildVectorAllOnes(V2.getNode())) 12635 return true; 12636 } 12637 12638 return false; 12639} 12640 12641static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, 12642 TargetLowering::DAGCombinerInfo &DCI, 12643 const X86Subtarget *Subtarget) { 12644 if (DCI.isBeforeLegalizeOps()) 12645 return SDValue(); 12646 12647 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 12648 if (R.getNode()) 12649 return R; 12650 12651 // Want to form ANDNP nodes: 12652 // 1) In the hopes of then easily combining them with OR and AND nodes 12653 // to form PBLEND/PSIGN. 
12654 // 2) To match ANDN packed intrinsics 12655 EVT VT = N->getValueType(0); 12656 if (VT != MVT::v2i64 && VT != MVT::v4i64) 12657 return SDValue(); 12658 12659 SDValue N0 = N->getOperand(0); 12660 SDValue N1 = N->getOperand(1); 12661 DebugLoc DL = N->getDebugLoc(); 12662 12663 // Check LHS for vnot 12664 if (N0.getOpcode() == ISD::XOR && 12665 //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) 12666 CanFoldXORWithAllOnes(N0.getOperand(1).getNode())) 12667 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); 12668 12669 // Check RHS for vnot 12670 if (N1.getOpcode() == ISD::XOR && 12671 //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) 12672 CanFoldXORWithAllOnes(N1.getOperand(1).getNode())) 12673 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); 12674 12675 return SDValue(); 12676} 12677 12678static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 12679 TargetLowering::DAGCombinerInfo &DCI, 12680 const X86Subtarget *Subtarget) { 12681 if (DCI.isBeforeLegalizeOps()) 12682 return SDValue(); 12683 12684 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 12685 if (R.getNode()) 12686 return R; 12687 12688 EVT VT = N->getValueType(0); 12689 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64) 12690 return SDValue(); 12691 12692 SDValue N0 = N->getOperand(0); 12693 SDValue N1 = N->getOperand(1); 12694 12695 // look for psign/blend 12696 if (Subtarget->hasSSSE3()) { 12697 if (VT == MVT::v2i64) { 12698 // Canonicalize pandn to RHS 12699 if (N0.getOpcode() == X86ISD::ANDNP) 12700 std::swap(N0, N1); 12701 // or (and (m, x), (pandn m, y)) 12702 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { 12703 SDValue Mask = N1.getOperand(0); 12704 SDValue X = N1.getOperand(1); 12705 SDValue Y; 12706 if (N0.getOperand(0) == Mask) 12707 Y = N0.getOperand(1); 12708 if (N0.getOperand(1) == Mask) 12709 Y = N0.getOperand(0); 12710 12711 // Check to see if the mask appeared in both the AND and ANDNP and 12712 if (!Y.getNode()) 12713 return SDValue(); 12714 12715 // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. 12716 if (Mask.getOpcode() != ISD::BITCAST || 12717 X.getOpcode() != ISD::BITCAST || 12718 Y.getOpcode() != ISD::BITCAST) 12719 return SDValue(); 12720 12721 // Look through mask bitcast. 12722 Mask = Mask.getOperand(0); 12723 EVT MaskVT = Mask.getValueType(); 12724 12725 // Validate that the Mask operand is a vector sra node. The sra node 12726 // will be an intrinsic. 12727 if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN) 12728 return SDValue(); 12729 12730 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but 12731 // there is no psrai.b 12732 switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) { 12733 case Intrinsic::x86_sse2_psrai_w: 12734 case Intrinsic::x86_sse2_psrai_d: 12735 break; 12736 default: return SDValue(); 12737 } 12738 12739 // Check that the SRA is all signbits. 12740 SDValue SraC = Mask.getOperand(2); 12741 unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); 12742 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); 12743 if ((SraAmt + 1) != EltBits) 12744 return SDValue(); 12745 12746 DebugLoc DL = N->getDebugLoc(); 12747 12748 // Now we know we at least have a plendvb with the mask val. See if 12749 // we can form a psignb/w/d. 
12750 // psign = x.type == y.type == mask.type && y = sub(0, x); 12751 X = X.getOperand(0); 12752 Y = Y.getOperand(0); 12753 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && 12754 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && 12755 X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){ 12756 unsigned Opc = 0; 12757 switch (EltBits) { 12758 case 8: Opc = X86ISD::PSIGNB; break; 12759 case 16: Opc = X86ISD::PSIGNW; break; 12760 case 32: Opc = X86ISD::PSIGND; break; 12761 default: break; 12762 } 12763 if (Opc) { 12764 SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1)); 12765 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign); 12766 } 12767 } 12768 // PBLENDVB only available on SSE 4.1 12769 if (!Subtarget->hasSSE41()) 12770 return SDValue(); 12771 12772 X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X); 12773 Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y); 12774 Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask); 12775 Mask = DAG.getNode(X86ISD::PBLENDVB, DL, MVT::v16i8, X, Y, Mask); 12776 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask); 12777 } 12778 } 12779 } 12780 12781 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 12782 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 12783 std::swap(N0, N1); 12784 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 12785 return SDValue(); 12786 if (!N0.hasOneUse() || !N1.hasOneUse()) 12787 return SDValue(); 12788 12789 SDValue ShAmt0 = N0.getOperand(1); 12790 if (ShAmt0.getValueType() != MVT::i8) 12791 return SDValue(); 12792 SDValue ShAmt1 = N1.getOperand(1); 12793 if (ShAmt1.getValueType() != MVT::i8) 12794 return SDValue(); 12795 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 12796 ShAmt0 = ShAmt0.getOperand(0); 12797 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 12798 ShAmt1 = ShAmt1.getOperand(0); 12799 12800 DebugLoc DL = N->getDebugLoc(); 12801 unsigned Opc = X86ISD::SHLD; 12802 SDValue Op0 = N0.getOperand(0); 12803 SDValue Op1 = N1.getOperand(0); 12804 if (ShAmt0.getOpcode() == ISD::SUB) { 12805 Opc = X86ISD::SHRD; 12806 std::swap(Op0, Op1); 12807 std::swap(ShAmt0, ShAmt1); 12808 } 12809 12810 unsigned Bits = VT.getSizeInBits(); 12811 if (ShAmt1.getOpcode() == ISD::SUB) { 12812 SDValue Sum = ShAmt1.getOperand(0); 12813 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 12814 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 12815 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 12816 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 12817 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 12818 return DAG.getNode(Opc, DL, VT, 12819 Op0, Op1, 12820 DAG.getNode(ISD::TRUNCATE, DL, 12821 MVT::i8, ShAmt0)); 12822 } 12823 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 12824 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 12825 if (ShAmt0C && 12826 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 12827 return DAG.getNode(Opc, DL, VT, 12828 N0.getOperand(0), N1.getOperand(0), 12829 DAG.getNode(ISD::TRUNCATE, DL, 12830 MVT::i8, ShAmt0)); 12831 } 12832 12833 return SDValue(); 12834} 12835 12836/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 
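/// This currently handles three cases: a 256-bit store of a two-operand
/// CONCAT_VECTORS is split into two 128-bit stores, a truncating vector
/// store is narrowed through a shuffle before being stored, and a 64-bit
/// load/store pair is rewritten as i64, f64, or two i32 accesses so that
/// MMX values do not touch the x87 state.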
12837static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 12838 const X86Subtarget *Subtarget) { 12839 StoreSDNode *St = cast<StoreSDNode>(N); 12840 EVT VT = St->getValue().getValueType(); 12841 EVT StVT = St->getMemoryVT(); 12842 DebugLoc dl = St->getDebugLoc(); 12843 SDValue StoredVal = St->getOperand(1); 12844 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12845 12846 // If we are saving a concatination of two XMM registers, perform two stores. 12847 // This is better in Sandy Bridge cause one 256-bit mem op is done via two 12848 // 128-bit ones. If in the future the cost becomes only one memory access the 12849 // first version would be better. 12850 if (VT.getSizeInBits() == 256 && 12851 StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && 12852 StoredVal.getNumOperands() == 2) { 12853 12854 SDValue Value0 = StoredVal.getOperand(0); 12855 SDValue Value1 = StoredVal.getOperand(1); 12856 12857 SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); 12858 SDValue Ptr0 = St->getBasePtr(); 12859 SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); 12860 12861 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, 12862 St->getPointerInfo(), St->isVolatile(), 12863 St->isNonTemporal(), St->getAlignment()); 12864 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, 12865 St->getPointerInfo(), St->isVolatile(), 12866 St->isNonTemporal(), St->getAlignment()); 12867 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); 12868 } 12869 12870 // Optimize trunc store (of multiple scalars) to shuffle and store. 12871 // First, pack all of the elements in one place. Next, store to memory 12872 // in fewer chunks. 12873 if (St->isTruncatingStore() && VT.isVector()) { 12874 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12875 unsigned NumElems = VT.getVectorNumElements(); 12876 assert(StVT != VT && "Cannot truncate to the same type"); 12877 unsigned FromSz = VT.getVectorElementType().getSizeInBits(); 12878 unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); 12879 12880 // From, To sizes and ElemCount must be pow of two 12881 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); 12882 // We are going to use the original vector elt for storing. 12883 // accumulated smaller vector elements must be a multiple of bigger size. 12884 if (0 != (NumElems * ToSz) % FromSz) return SDValue(); 12885 unsigned SizeRatio = FromSz / ToSz; 12886 12887 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); 12888 12889 // Create a type on which we perform the shuffle 12890 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 12891 StVT.getScalarType(), NumElems*SizeRatio); 12892 12893 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 12894 12895 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); 12896 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 12897 for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio; 12898 12899 // Can't shuffle using an illegal type 12900 if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); 12901 12902 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, 12903 DAG.getUNDEF(WideVec.getValueType()), 12904 ShuffleVec.data()); 12905 // At this point all of the data is stored at the bottom of the 12906 // register. We now need to save it to mem. 
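      // (Worked example, assuming an x86-64 target: a truncating store of
      //  v8i16 to v8i8 has SizeRatio == 2, the shuffle above packs the eight
      //  truncated bytes into the low 64 bits of a v16i8, and the loop below
      //  then emits a single i64 store.)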
12907 12908 // Find the largest store unit 12909 MVT StoreType = MVT::i8; 12910 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; 12911 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { 12912 MVT Tp = (MVT::SimpleValueType)tp; 12913 if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz) 12914 StoreType = Tp; 12915 } 12916 12917 // Bitcast the original vector into a vector of store-size units 12918 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 12919 StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); 12920 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 12921 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); 12922 SmallVector<SDValue, 8> Chains; 12923 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, 12924 TLI.getPointerTy()); 12925 SDValue Ptr = St->getBasePtr(); 12926 12927 // Perform one or more big stores into memory. 12928 for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) { 12929 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 12930 StoreType, ShuffWide, 12931 DAG.getIntPtrConstant(i)); 12932 SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, 12933 St->getPointerInfo(), St->isVolatile(), 12934 St->isNonTemporal(), St->getAlignment()); 12935 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 12936 Chains.push_back(Ch); 12937 } 12938 12939 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], 12940 Chains.size()); 12941 } 12942 12943 12944 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 12945 // the FP state in cases where an emms may be missing. 12946 // A preferable solution to the general problem is to figure out the right 12947 // places to insert EMMS. This qualifies as a quick hack. 12948 12949 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 12950 if (VT.getSizeInBits() != 64) 12951 return SDValue(); 12952 12953 const Function *F = DAG.getMachineFunction().getFunction(); 12954 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 12955 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 12956 && Subtarget->hasSSE2(); 12957 if ((VT.isVector() || 12958 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 12959 isa<LoadSDNode>(St->getValue()) && 12960 !cast<LoadSDNode>(St->getValue())->isVolatile() && 12961 St->getChain().hasOneUse() && !St->isVolatile()) { 12962 SDNode* LdVal = St->getValue().getNode(); 12963 LoadSDNode *Ld = 0; 12964 int TokenFactorIndex = -1; 12965 SmallVector<SDValue, 8> Ops; 12966 SDNode* ChainVal = St->getChain().getNode(); 12967 // Must be a store of a load. We currently handle two cases: the load 12968 // is a direct child, and it's under an intervening TokenFactor. It is 12969 // possible to dig deeper under nested TokenFactors. 12970 if (ChainVal == LdVal) 12971 Ld = cast<LoadSDNode>(St->getChain()); 12972 else if (St->getValue().hasOneUse() && 12973 ChainVal->getOpcode() == ISD::TokenFactor) { 12974 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 12975 if (ChainVal->getOperand(i).getNode() == LdVal) { 12976 TokenFactorIndex = i; 12977 Ld = cast<LoadSDNode>(St->getValue()); 12978 } else 12979 Ops.push_back(ChainVal->getOperand(i)); 12980 } 12981 } 12982 12983 if (!Ld || !ISD::isNormalLoad(Ld)) 12984 return SDValue(); 12985 12986 // If this is not the MMX case, i.e. we are just turning i64 load/store 12987 // into f64 load/store, avoid the transformation if there are multiple 12988 // uses of the loaded value. 
12989 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 12990 return SDValue(); 12991 12992 DebugLoc LdDL = Ld->getDebugLoc(); 12993 DebugLoc StDL = N->getDebugLoc(); 12994 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 12995 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 12996 // pair instead. 12997 if (Subtarget->is64Bit() || F64IsLegal) { 12998 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 12999 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 13000 Ld->getPointerInfo(), Ld->isVolatile(), 13001 Ld->isNonTemporal(), Ld->getAlignment()); 13002 SDValue NewChain = NewLd.getValue(1); 13003 if (TokenFactorIndex != -1) { 13004 Ops.push_back(NewChain); 13005 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 13006 Ops.size()); 13007 } 13008 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 13009 St->getPointerInfo(), 13010 St->isVolatile(), St->isNonTemporal(), 13011 St->getAlignment()); 13012 } 13013 13014 // Otherwise, lower to two pairs of 32-bit loads / stores. 13015 SDValue LoAddr = Ld->getBasePtr(); 13016 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 13017 DAG.getConstant(4, MVT::i32)); 13018 13019 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 13020 Ld->getPointerInfo(), 13021 Ld->isVolatile(), Ld->isNonTemporal(), 13022 Ld->getAlignment()); 13023 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 13024 Ld->getPointerInfo().getWithOffset(4), 13025 Ld->isVolatile(), Ld->isNonTemporal(), 13026 MinAlign(Ld->getAlignment(), 4)); 13027 13028 SDValue NewChain = LoLd.getValue(1); 13029 if (TokenFactorIndex != -1) { 13030 Ops.push_back(LoLd); 13031 Ops.push_back(HiLd); 13032 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 13033 Ops.size()); 13034 } 13035 13036 LoAddr = St->getBasePtr(); 13037 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 13038 DAG.getConstant(4, MVT::i32)); 13039 13040 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 13041 St->getPointerInfo(), 13042 St->isVolatile(), St->isNonTemporal(), 13043 St->getAlignment()); 13044 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 13045 St->getPointerInfo().getWithOffset(4), 13046 St->isVolatile(), 13047 St->isNonTemporal(), 13048 MinAlign(St->getAlignment(), 4)); 13049 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 13050 } 13051 return SDValue(); 13052} 13053 13054/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 13055/// X86ISD::FXOR nodes. 13056static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 13057 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 13058 // F[X]OR(0.0, x) -> x 13059 // F[X]OR(x, 0.0) -> x 13060 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 13061 if (C->getValueAPF().isPosZero()) 13062 return N->getOperand(1); 13063 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 13064 if (C->getValueAPF().isPosZero()) 13065 return N->getOperand(0); 13066 return SDValue(); 13067} 13068 13069/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 
13070static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 13071 // FAND(0.0, x) -> 0.0 13072 // FAND(x, 0.0) -> 0.0 13073 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 13074 if (C->getValueAPF().isPosZero()) 13075 return N->getOperand(0); 13076 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 13077 if (C->getValueAPF().isPosZero()) 13078 return N->getOperand(1); 13079 return SDValue(); 13080} 13081 13082static SDValue PerformBTCombine(SDNode *N, 13083 SelectionDAG &DAG, 13084 TargetLowering::DAGCombinerInfo &DCI) { 13085 // BT ignores high bits in the bit index operand. 13086 SDValue Op1 = N->getOperand(1); 13087 if (Op1.hasOneUse()) { 13088 unsigned BitWidth = Op1.getValueSizeInBits(); 13089 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 13090 APInt KnownZero, KnownOne; 13091 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 13092 !DCI.isBeforeLegalizeOps()); 13093 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13094 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 13095 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 13096 DCI.CommitTargetLoweringOpt(TLO); 13097 } 13098 return SDValue(); 13099} 13100 13101static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 13102 SDValue Op = N->getOperand(0); 13103 if (Op.getOpcode() == ISD::BITCAST) 13104 Op = Op.getOperand(0); 13105 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 13106 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 13107 VT.getVectorElementType().getSizeInBits() == 13108 OpVT.getVectorElementType().getSizeInBits()) { 13109 return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); 13110 } 13111 return SDValue(); 13112} 13113 13114static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 13115 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 13116 // (and (i32 x86isd::setcc_carry), 1) 13117 // This eliminates the zext. This transformation is necessary because 13118 // ISD::SETCC is always legalized to i8. 13119 DebugLoc dl = N->getDebugLoc(); 13120 SDValue N0 = N->getOperand(0); 13121 EVT VT = N->getValueType(0); 13122 if (N0.getOpcode() == ISD::AND && 13123 N0.hasOneUse() && 13124 N0.getOperand(0).hasOneUse()) { 13125 SDValue N00 = N0.getOperand(0); 13126 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 13127 return SDValue(); 13128 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 13129 if (!C || C->getZExtValue() != 1) 13130 return SDValue(); 13131 return DAG.getNode(ISD::AND, dl, VT, 13132 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 13133 N00.getOperand(0), N00.getOperand(1)), 13134 DAG.getConstant(1, VT)); 13135 } 13136 13137 return SDValue(); 13138} 13139 13140// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 13141static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { 13142 unsigned X86CC = N->getConstantOperandVal(0); 13143 SDValue EFLAG = N->getOperand(1); 13144 DebugLoc DL = N->getDebugLoc(); 13145 13146 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 13147 // a zext and produces an all-ones bit which is more useful than 0/1 in some 13148 // cases. 
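  // For example, (setcc COND_B, EFLAGS) is rewritten below as
  // (and (setcc_carry COND_B, EFLAGS), 1); SETCC_CARRY materializes as
  // "sbb reg,reg", which is all ones exactly when the carry flag is set.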
13149 if (X86CC == X86::COND_B) 13150 return DAG.getNode(ISD::AND, DL, MVT::i8, 13151 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 13152 DAG.getConstant(X86CC, MVT::i8), EFLAG), 13153 DAG.getConstant(1, MVT::i8)); 13154 13155 return SDValue(); 13156} 13157 13158static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, 13159 const X86TargetLowering *XTLI) { 13160 SDValue Op0 = N->getOperand(0); 13161 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have 13162 // a 32-bit target where SSE doesn't support i64->FP operations. 13163 if (Op0.getOpcode() == ISD::LOAD) { 13164 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); 13165 EVT VT = Ld->getValueType(0); 13166 if (!Ld->isVolatile() && !N->getValueType(0).isVector() && 13167 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && 13168 !XTLI->getSubtarget()->is64Bit() && 13169 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 13170 SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), 13171 Ld->getChain(), Op0, DAG); 13172 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); 13173 return FILDChain; 13174 } 13175 } 13176 return SDValue(); 13177} 13178 13179// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS 13180static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, 13181 X86TargetLowering::DAGCombinerInfo &DCI) { 13182 // If the LHS and RHS of the ADC node are zero, then it can't overflow and 13183 // the result is either zero or one (depending on the input carry bit). 13184 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 13185 if (X86::isZeroNode(N->getOperand(0)) && 13186 X86::isZeroNode(N->getOperand(1)) && 13187 // We don't have a good way to replace an EFLAGS use, so only do this when 13188 // dead right now. 13189 SDValue(N, 1).use_empty()) { 13190 DebugLoc DL = N->getDebugLoc(); 13191 EVT VT = N->getValueType(0); 13192 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); 13193 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, 13194 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, 13195 DAG.getConstant(X86::COND_B,MVT::i8), 13196 N->getOperand(2)), 13197 DAG.getConstant(1, VT)); 13198 return DCI.CombineTo(N, Res1, CarryOut); 13199 } 13200 13201 return SDValue(); 13202} 13203 13204// fold (add Y, (sete X, 0)) -> adc 0, Y 13205// (add Y, (setne X, 0)) -> sbb -1, Y 13206// (sub (sete X, 0), Y) -> sbb 0, Y 13207// (sub (setne X, 0), Y) -> adc -1, Y 13208static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { 13209 DebugLoc DL = N->getDebugLoc(); 13210 13211 // Look through ZExts. 13212 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); 13213 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) 13214 return SDValue(); 13215 13216 SDValue SetCC = Ext.getOperand(0); 13217 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) 13218 return SDValue(); 13219 13220 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); 13221 if (CC != X86::COND_E && CC != X86::COND_NE) 13222 return SDValue(); 13223 13224 SDValue Cmp = SetCC.getOperand(1); 13225 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || 13226 !X86::isZeroNode(Cmp.getOperand(1)) || 13227 !Cmp.getOperand(0).getValueType().isInteger()) 13228 return SDValue(); 13229 13230 SDValue CmpOp0 = Cmp.getOperand(0); 13231 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, 13232 DAG.getConstant(1, CmpOp0.getValueType())); 13233 13234 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 
0 : 1); 13235 if (CC == X86::COND_NE) 13236 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, 13237 DL, OtherVal.getValueType(), OtherVal, 13238 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); 13239 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, 13240 DL, OtherVal.getValueType(), OtherVal, 13241 DAG.getConstant(0, OtherVal.getValueType()), NewCmp); 13242} 13243 13244static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) { 13245 SDValue Op0 = N->getOperand(0); 13246 SDValue Op1 = N->getOperand(1); 13247 13248 // X86 can't encode an immediate LHS of a sub. See if we can push the 13249 // negation into a preceding instruction. 13250 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) { 13251 uint64_t Op0C = C->getSExtValue(); 13252 13253 // If the RHS of the sub is a XOR with one use and a constant, invert the 13254 // immediate. Then add one to the LHS of the sub so we can turn 13255 // X-Y -> X+~Y+1, saving one register. 13256 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && 13257 isa<ConstantSDNode>(Op1.getOperand(1))) { 13258 uint64_t XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getSExtValue(); 13259 EVT VT = Op0.getValueType(); 13260 SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT, 13261 Op1.getOperand(0), 13262 DAG.getConstant(~XorC, VT)); 13263 return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor, 13264 DAG.getConstant(Op0C+1, VT)); 13265 } 13266 } 13267 13268 return OptimizeConditionalInDecrement(N, DAG); 13269} 13270 13271SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 13272 DAGCombinerInfo &DCI) const { 13273 SelectionDAG &DAG = DCI.DAG; 13274 switch (N->getOpcode()) { 13275 default: break; 13276 case ISD::EXTRACT_VECTOR_ELT: 13277 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 13278 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 13279 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 13280 case ISD::ADD: return OptimizeConditionalInDecrement(N, DAG); 13281 case ISD::SUB: return PerformSubCombine(N, DAG); 13282 case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); 13283 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 13284 case ISD::SHL: 13285 case ISD::SRA: 13286 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 13287 case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); 13288 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 13289 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 13290 case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); 13291 case X86ISD::FXOR: 13292 case X86ISD::FOR: return PerformFORCombine(N, DAG); 13293 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 13294 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 13295 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 13296 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 13297 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); 13298 case X86ISD::SHUFPS: // Handle all target specific shuffles 13299 case X86ISD::SHUFPD: 13300 case X86ISD::PALIGN: 13301 case X86ISD::PUNPCKHBW: 13302 case X86ISD::PUNPCKHWD: 13303 case X86ISD::PUNPCKHDQ: 13304 case X86ISD::PUNPCKHQDQ: 13305 case X86ISD::UNPCKHPS: 13306 case X86ISD::UNPCKHPD: 13307 case X86ISD::VUNPCKHPSY: 13308 case X86ISD::VUNPCKHPDY: 13309 case X86ISD::PUNPCKLBW: 13310 case X86ISD::PUNPCKLWD: 13311 case X86ISD::PUNPCKLDQ: 13312 case X86ISD::PUNPCKLQDQ: 13313 case X86ISD::UNPCKLPS: 13314 case X86ISD::UNPCKLPD: 
13315 case X86ISD::VUNPCKLPSY: 13316 case X86ISD::VUNPCKLPDY: 13317 case X86ISD::MOVHLPS: 13318 case X86ISD::MOVLHPS: 13319 case X86ISD::PSHUFD: 13320 case X86ISD::PSHUFHW: 13321 case X86ISD::PSHUFLW: 13322 case X86ISD::MOVSS: 13323 case X86ISD::MOVSD: 13324 case X86ISD::VPERMILPS: 13325 case X86ISD::VPERMILPSY: 13326 case X86ISD::VPERMILPD: 13327 case X86ISD::VPERMILPDY: 13328 case X86ISD::VPERM2F128: 13329 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); 13330 } 13331 13332 return SDValue(); 13333} 13334 13335/// isTypeDesirableForOp - Return true if the target has native support for 13336/// the specified value type and it is 'desirable' to use the type for the 13337/// given node type. e.g. On x86 i16 is legal, but undesirable since i16 13338/// instruction encodings are longer and some i16 instructions are slow. 13339bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { 13340 if (!isTypeLegal(VT)) 13341 return false; 13342 if (VT != MVT::i16) 13343 return true; 13344 13345 switch (Opc) { 13346 default: 13347 return true; 13348 case ISD::LOAD: 13349 case ISD::SIGN_EXTEND: 13350 case ISD::ZERO_EXTEND: 13351 case ISD::ANY_EXTEND: 13352 case ISD::SHL: 13353 case ISD::SRL: 13354 case ISD::SUB: 13355 case ISD::ADD: 13356 case ISD::MUL: 13357 case ISD::AND: 13358 case ISD::OR: 13359 case ISD::XOR: 13360 return false; 13361 } 13362} 13363 13364/// IsDesirableToPromoteOp - This method query the target whether it is 13365/// beneficial for dag combiner to promote the specified node. If true, it 13366/// should return the desired promotion type by reference. 13367bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { 13368 EVT VT = Op.getValueType(); 13369 if (VT != MVT::i16) 13370 return false; 13371 13372 bool Promote = false; 13373 bool Commute = false; 13374 switch (Op.getOpcode()) { 13375 default: break; 13376 case ISD::LOAD: { 13377 LoadSDNode *LD = cast<LoadSDNode>(Op); 13378 // If the non-extending load has a single use and it's not live out, then it 13379 // might be folded. 13380 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& 13381 Op.hasOneUse()*/) { 13382 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 13383 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 13384 // The only case where we'd want to promote LOAD (rather then it being 13385 // promoted as an operand is when it's only use is liveout. 13386 if (UI->getOpcode() != ISD::CopyToReg) 13387 return false; 13388 } 13389 } 13390 Promote = true; 13391 break; 13392 } 13393 case ISD::SIGN_EXTEND: 13394 case ISD::ZERO_EXTEND: 13395 case ISD::ANY_EXTEND: 13396 Promote = true; 13397 break; 13398 case ISD::SHL: 13399 case ISD::SRL: { 13400 SDValue N0 = Op.getOperand(0); 13401 // Look out for (store (shl (load), x)). 13402 if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) 13403 return false; 13404 Promote = true; 13405 break; 13406 } 13407 case ISD::ADD: 13408 case ISD::MUL: 13409 case ISD::AND: 13410 case ISD::OR: 13411 case ISD::XOR: 13412 Commute = true; 13413 // fallthrough 13414 case ISD::SUB: { 13415 SDValue N0 = Op.getOperand(0); 13416 SDValue N1 = Op.getOperand(1); 13417 if (!Commute && MayFoldLoad(N1)) 13418 return false; 13419 // Avoid disabling potential load folding opportunities. 
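    // For instance, a 16-bit (add (load), x) is left at i16 when the load
    // could fold into the add or the result may fold into a store, since
    // promoting to i32 would block folding the i16 memory operand.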
13420 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 13421 return false; 13422 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 13423 return false; 13424 Promote = true; 13425 } 13426 } 13427 13428 PVT = MVT::i32; 13429 return Promote; 13430} 13431 13432//===----------------------------------------------------------------------===// 13433// X86 Inline Assembly Support 13434//===----------------------------------------------------------------------===// 13435 13436bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 13437 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 13438 13439 std::string AsmStr = IA->getAsmString(); 13440 13441 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 13442 SmallVector<StringRef, 4> AsmPieces; 13443 SplitString(AsmStr, AsmPieces, ";\n"); 13444 13445 switch (AsmPieces.size()) { 13446 default: return false; 13447 case 1: 13448 AsmStr = AsmPieces[0]; 13449 AsmPieces.clear(); 13450 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 13451 13452 // FIXME: this should verify that we are targeting a 486 or better. If not, 13453 // we will turn this bswap into something that will be lowered to logical ops 13454 // instead of emitting the bswap asm. For now, we don't support 486 or lower 13455 // so don't worry about this. 13456 // bswap $0 13457 if (AsmPieces.size() == 2 && 13458 (AsmPieces[0] == "bswap" || 13459 AsmPieces[0] == "bswapq" || 13460 AsmPieces[0] == "bswapl") && 13461 (AsmPieces[1] == "$0" || 13462 AsmPieces[1] == "${0:q}")) { 13463 // No need to check constraints, nothing other than the equivalent of 13464 // "=r,0" would be valid here. 13465 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 13466 if (!Ty || Ty->getBitWidth() % 16 != 0) 13467 return false; 13468 return IntrinsicLowering::LowerToByteSwap(CI); 13469 } 13470 // rorw $$8, ${0:w} --> llvm.bswap.i16 13471 if (CI->getType()->isIntegerTy(16) && 13472 AsmPieces.size() == 3 && 13473 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 13474 AsmPieces[1] == "$$8," && 13475 AsmPieces[2] == "${0:w}" && 13476 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 13477 AsmPieces.clear(); 13478 const std::string &ConstraintsStr = IA->getConstraintString(); 13479 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 13480 std::sort(AsmPieces.begin(), AsmPieces.end()); 13481 if (AsmPieces.size() == 4 && 13482 AsmPieces[0] == "~{cc}" && 13483 AsmPieces[1] == "~{dirflag}" && 13484 AsmPieces[2] == "~{flags}" && 13485 AsmPieces[3] == "~{fpsr}") { 13486 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 13487 if (!Ty || Ty->getBitWidth() % 16 != 0) 13488 return false; 13489 return IntrinsicLowering::LowerToByteSwap(CI); 13490 } 13491 } 13492 break; 13493 case 3: 13494 if (CI->getType()->isIntegerTy(32) && 13495 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 13496 SmallVector<StringRef, 4> Words; 13497 SplitString(AsmPieces[0], Words, " \t,"); 13498 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 13499 Words[2] == "${0:w}") { 13500 Words.clear(); 13501 SplitString(AsmPieces[1], Words, " \t,"); 13502 if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" && 13503 Words[2] == "$0") { 13504 Words.clear(); 13505 SplitString(AsmPieces[2], Words, " \t,"); 13506 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 13507 Words[2] == "${0:w}") { 13508 AsmPieces.clear(); 13509 const std::string &ConstraintsStr = 
IA->getConstraintString(); 13510 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 13511 std::sort(AsmPieces.begin(), AsmPieces.end()); 13512 if (AsmPieces.size() == 4 && 13513 AsmPieces[0] == "~{cc}" && 13514 AsmPieces[1] == "~{dirflag}" && 13515 AsmPieces[2] == "~{flags}" && 13516 AsmPieces[3] == "~{fpsr}") { 13517 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 13518 if (!Ty || Ty->getBitWidth() % 16 != 0) 13519 return false; 13520 return IntrinsicLowering::LowerToByteSwap(CI); 13521 } 13522 } 13523 } 13524 } 13525 } 13526 13527 if (CI->getType()->isIntegerTy(64)) { 13528 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); 13529 if (Constraints.size() >= 2 && 13530 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 13531 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 13532 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 13533 SmallVector<StringRef, 4> Words; 13534 SplitString(AsmPieces[0], Words, " \t"); 13535 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 13536 Words.clear(); 13537 SplitString(AsmPieces[1], Words, " \t"); 13538 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 13539 Words.clear(); 13540 SplitString(AsmPieces[2], Words, " \t,"); 13541 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 13542 Words[2] == "%edx") { 13543 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 13544 if (!Ty || Ty->getBitWidth() % 16 != 0) 13545 return false; 13546 return IntrinsicLowering::LowerToByteSwap(CI); 13547 } 13548 } 13549 } 13550 } 13551 } 13552 break; 13553 } 13554 return false; 13555} 13556 13557 13558 13559/// getConstraintType - Given a constraint letter, return the type of 13560/// constraint it is for this target. 13561X86TargetLowering::ConstraintType 13562X86TargetLowering::getConstraintType(const std::string &Constraint) const { 13563 if (Constraint.size() == 1) { 13564 switch (Constraint[0]) { 13565 case 'R': 13566 case 'q': 13567 case 'Q': 13568 case 'f': 13569 case 't': 13570 case 'u': 13571 case 'y': 13572 case 'x': 13573 case 'Y': 13574 case 'l': 13575 return C_RegisterClass; 13576 case 'a': 13577 case 'b': 13578 case 'c': 13579 case 'd': 13580 case 'S': 13581 case 'D': 13582 case 'A': 13583 return C_Register; 13584 case 'I': 13585 case 'J': 13586 case 'K': 13587 case 'L': 13588 case 'M': 13589 case 'N': 13590 case 'G': 13591 case 'C': 13592 case 'e': 13593 case 'Z': 13594 return C_Other; 13595 default: 13596 break; 13597 } 13598 } 13599 return TargetLowering::getConstraintType(Constraint); 13600} 13601 13602/// Examine constraint type and operand type and determine a weight value. 13603/// This object must already have been set up with the operand type 13604/// and the current alternative constraint selected. 13605TargetLowering::ConstraintWeight 13606 X86TargetLowering::getSingleConstraintMatchWeight( 13607 AsmOperandInfo &info, const char *constraint) const { 13608 ConstraintWeight weight = CW_Invalid; 13609 Value *CallOperandVal = info.CallOperandVal; 13610 // If we don't have a value, we can't do a match, 13611 // but allow it at the lowest weight. 13612 if (CallOperandVal == NULL) 13613 return CW_Default; 13614 Type *type = CallOperandVal->getType(); 13615 // Look at the constraint type. 
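  // (Sketch: a 'y' constraint paired with an x86_mmx-typed operand scores
  //  CW_SpecificReg below, while an immediate that satisfies 'I' (0..31)
  //  scores CW_Constant.)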
13616 switch (*constraint) { 13617 default: 13618 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 13619 case 'R': 13620 case 'q': 13621 case 'Q': 13622 case 'a': 13623 case 'b': 13624 case 'c': 13625 case 'd': 13626 case 'S': 13627 case 'D': 13628 case 'A': 13629 if (CallOperandVal->getType()->isIntegerTy()) 13630 weight = CW_SpecificReg; 13631 break; 13632 case 'f': 13633 case 't': 13634 case 'u': 13635 if (type->isFloatingPointTy()) 13636 weight = CW_SpecificReg; 13637 break; 13638 case 'y': 13639 if (type->isX86_MMXTy() && Subtarget->hasMMX()) 13640 weight = CW_SpecificReg; 13641 break; 13642 case 'x': 13643 case 'Y': 13644 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM()) 13645 weight = CW_Register; 13646 break; 13647 case 'I': 13648 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { 13649 if (C->getZExtValue() <= 31) 13650 weight = CW_Constant; 13651 } 13652 break; 13653 case 'J': 13654 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 13655 if (C->getZExtValue() <= 63) 13656 weight = CW_Constant; 13657 } 13658 break; 13659 case 'K': 13660 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 13661 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) 13662 weight = CW_Constant; 13663 } 13664 break; 13665 case 'L': 13666 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 13667 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) 13668 weight = CW_Constant; 13669 } 13670 break; 13671 case 'M': 13672 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 13673 if (C->getZExtValue() <= 3) 13674 weight = CW_Constant; 13675 } 13676 break; 13677 case 'N': 13678 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 13679 if (C->getZExtValue() <= 0xff) 13680 weight = CW_Constant; 13681 } 13682 break; 13683 case 'G': 13684 case 'C': 13685 if (dyn_cast<ConstantFP>(CallOperandVal)) { 13686 weight = CW_Constant; 13687 } 13688 break; 13689 case 'e': 13690 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 13691 if ((C->getSExtValue() >= -0x80000000LL) && 13692 (C->getSExtValue() <= 0x7fffffffLL)) 13693 weight = CW_Constant; 13694 } 13695 break; 13696 case 'Z': 13697 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 13698 if (C->getZExtValue() <= 0xffffffff) 13699 weight = CW_Constant; 13700 } 13701 break; 13702 } 13703 return weight; 13704} 13705 13706/// LowerXConstraint - try to replace an X constraint, which matches anything, 13707/// with another that has more specific requirements based on the type of the 13708/// corresponding operand. 13709const char *X86TargetLowering:: 13710LowerXConstraint(EVT ConstraintVT) const { 13711 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 13712 // 'f' like normal targets. 13713 if (ConstraintVT.isFloatingPoint()) { 13714 if (Subtarget->hasXMMInt()) 13715 return "Y"; 13716 if (Subtarget->hasXMM()) 13717 return "x"; 13718 } 13719 13720 return TargetLowering::LowerXConstraint(ConstraintVT); 13721} 13722 13723/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 13724/// vector. If it is invalid, don't add anything to Ops. 13725void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 13726 std::string &Constraint, 13727 std::vector<SDValue>&Ops, 13728 SelectionDAG &DAG) const { 13729 SDValue Result(0, 0); 13730 13731 // Only support length 1 constraints for now. 
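  // (Illustrative: for constraint 'I', a constant operand of 17 is emitted
  //  as a target constant below, while 42 is outside the 0..31 range, so
  //  nothing is added to Ops, per the contract described above.)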
13732 if (Constraint.length() > 1) return; 13733 13734 char ConstraintLetter = Constraint[0]; 13735 switch (ConstraintLetter) { 13736 default: break; 13737 case 'I': 13738 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13739 if (C->getZExtValue() <= 31) { 13740 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13741 break; 13742 } 13743 } 13744 return; 13745 case 'J': 13746 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13747 if (C->getZExtValue() <= 63) { 13748 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13749 break; 13750 } 13751 } 13752 return; 13753 case 'K': 13754 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13755 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 13756 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13757 break; 13758 } 13759 } 13760 return; 13761 case 'N': 13762 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13763 if (C->getZExtValue() <= 255) { 13764 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13765 break; 13766 } 13767 } 13768 return; 13769 case 'e': { 13770 // 32-bit signed value 13771 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13772 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 13773 C->getSExtValue())) { 13774 // Widen to 64 bits here to get it sign extended. 13775 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 13776 break; 13777 } 13778 // FIXME gcc accepts some relocatable values here too, but only in certain 13779 // memory models; it's complicated. 13780 } 13781 return; 13782 } 13783 case 'Z': { 13784 // 32-bit unsigned value 13785 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13786 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 13787 C->getZExtValue())) { 13788 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13789 break; 13790 } 13791 } 13792 // FIXME gcc accepts some relocatable values here too, but only in certain 13793 // memory models; it's complicated. 13794 return; 13795 } 13796 case 'i': { 13797 // Literal immediates are always ok. 13798 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 13799 // Widen to 64 bits here to get it sign extended. 13800 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 13801 break; 13802 } 13803 13804 // In any sort of PIC mode addresses need to be computed at runtime by 13805 // adding in a register or some sort of table lookup. These can't 13806 // be used as immediates. 13807 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 13808 return; 13809 13810 // If we are in non-pic codegen mode, we allow the address of a global (with 13811 // an optional displacement) to be used with 'i'. 13812 GlobalAddressSDNode *GA = 0; 13813 int64_t Offset = 0; 13814 13815 // Match either (GA), (GA+C), (GA+C1+C2), etc. 13816 while (1) { 13817 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 13818 Offset += GA->getOffset(); 13819 break; 13820 } else if (Op.getOpcode() == ISD::ADD) { 13821 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 13822 Offset += C->getZExtValue(); 13823 Op = Op.getOperand(0); 13824 continue; 13825 } 13826 } else if (Op.getOpcode() == ISD::SUB) { 13827 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 13828 Offset += -C->getZExtValue(); 13829 Op = Op.getOperand(0); 13830 continue; 13831 } 13832 } 13833 13834 // Otherwise, this isn't something we can handle, reject it. 
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
                                                        getTargetMachine())))
      return;

    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
      // TODO: Slight differences here in allocation order and leaving
      // RIP in the class.  Do they matter any more here than they do
      // in the normal allocation?
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, X86::GR32RegisterClass);
        else if (VT == MVT::i16)
          return std::make_pair(0U, X86::GR16RegisterClass);
        else if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, X86::GR8RegisterClass);
        else if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, X86::GR64RegisterClass);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, X86::GR32_ABCDRegisterClass);
      else if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_ABCDRegisterClass);
      else if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass);
      else if (VT == MVT::i64)
        return std::make_pair(0U, X86::GR64_ABCDRegisterClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
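      // (Illustrative note, not part of the original source: a constraint
      //  string such as "=y,y" on x86_mmx operands requests VR64 here; when
      //  MMX is unavailable the break below falls back to the generic
      //  TargetLowering handling, which has no mapping for 'y'.)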
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasXMMInt()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasXMM()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::ST0+Constraint[4]-'0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it
  // to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
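  // (Illustrative example, not part of the original source: "{ax}" with an
  //  i32 operand initially comes back as AX in GR16; the code below rewrites
  //  that to EAX in GR32 so the full 32-bit register is used.)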
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}
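// (Illustrative note, not part of the original source: with a constraint such
//  as "{xmm0}" and an f32 operand, the generic mapper may hand back XMM0 in a
//  128-bit vector class; the tail of getRegForInlineAsmConstraint above
//  narrows the class to FR32 so the scalar value is handled correctly.)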