X86ISelLowering.cpp revision a395f4df5b6d9c2feb661091ca75be2500d07cb0
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86ISelLowering.h"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "Utils/X86ShuffleDecode.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VariadicFunction.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <bitset>
#include <cctype>
using namespace llvm;

STATISTIC(NumTailCalls, "Number of tail calls");

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

/// Generate a DAG to grab 128 bits from a vector > 128 bits.  This
/// sets things up to match to an AVX VEXTRACTF128 instruction or a
/// simple subregister reference.  Idx is an index in the 128 bits we
/// want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, DebugLoc dl) {
  EVT VT = Vec.getValueType();
  assert(VT.is256BitVector() && "Unexpected vector size!");
  EVT ElVT = VT.getVectorElementType();
  unsigned Factor = VT.getSizeInBits()/128;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract from UNDEF is UNDEF.
  if (Vec.getOpcode() == ISD::UNDEF)
    return DAG.getUNDEF(ResultVT);

  // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
  // we can match to VEXTRACTF128.
  unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();

  // This is the index of the first element of the 128-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
                               * ElemsPerChunk);

  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                               VecIdx);

  return Result;
}
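// Example of the index normalization above: with a v8i32 source
// (ElVT = i32), ElemsPerChunk is 128/32 = 4, so IdxVal = 5 normalizes
// to ((5*32)/128)*4 = 4; the extract returns elements 4..7, i.e. the
// upper 128-bit half containing element 5.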
/// Generate a DAG to put 128 bits into a vector > 128 bits.  This
/// sets things up to match to an AVX VINSERTF128 instruction or a
/// simple superregister reference.  Idx is an index in the 128 bits
/// we want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
                                  unsigned IdxVal, SelectionDAG &DAG,
                                  DebugLoc dl) {
  // Inserting UNDEF is a no-op: the result is just Result.
  if (Vec.getOpcode() == ISD::UNDEF)
    return Result;

  EVT VT = Vec.getValueType();
  assert(VT.is128BitVector() && "Unexpected vector size!");

  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant 128 bits.
  unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();

  // This is the index of the first element of the 128-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
                               * ElemsPerChunk);

  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                     VecIdx);
}

/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
/// instructions. This is used because creating a CONCAT_VECTORS node of
/// BUILD_VECTORs returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORs.
static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   DebugLoc dl) {
  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
}

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
  bool is64Bit = Subtarget->is64Bit();

  if (Subtarget->isTargetEnvMacho()) {
    if (is64Bit)
      return new X86_64MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  }

  if (Subtarget->isTargetLinux())
    return new X86LinuxTargetObjectFile();
  if (Subtarget->isTargetELF())
    return new TargetLoweringObjectFileELF();
  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    return new TargetLoweringObjectFileCOFF();
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getDataLayout();

  // Set up the TargetLowering object.
  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird: it always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
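  // For example, the SSE compare instructions (PCMPEQ*, PCMPGT*, CMPPS)
  // produce all-ones (-1) in each lane where the predicate holds and
  // all-zeros elsewhere, which is why vector booleans are modeled as
  // ZeroOrNegativeOneBooleanContent here.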
  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit code, use the register-pressure-specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget->isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  // Bypass i32 divides with i8 divides on Atom when compiling with O2.
  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default)
    addBypassSlowDiv(32, 8);

  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    // Set up the Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);

    // The _ftol2 runtime function has an unusual calling conv, which
    // is modeled by a special pseudo-instruction.
    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8,  &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i16, MVT::i8 , Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
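  // The reason: UCOMISS/UCOMISD set ZF, PF, and CF all to 1 on an
  // unordered compare, so "ordered equal" is ZF==1 && PF==0 and
  // "unordered not-equal" is ZF==0 || PF==1 -- two flag tests either way.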
  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
  } else if (!TM.Options.UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);

  if (!TM.Options.UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP   , MVT::i16  , Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP   , MVT::i32  , Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP   , MVT::i16  , Custom);
      setOperationAction(ISD::SINT_TO_FP   , MVT::i32  , Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
  }

  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT       , MVT::i64  , Custom);
  setOperationAction(ISD::SINT_TO_FP       , MVT::i64  , Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
  } else if (!TM.Options.UseSoftFloat) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
  }
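  // Promoting FP_TO_UINT to a wider FP_TO_SINT (as done above for
  // i1/i8/i16, and for i32 on 64-bit targets) works because the unsigned
  // range of the narrow type is exactly representable in the wider signed
  // type; the result is then truncated back down.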
  if (isTargetFTOL()) {
    // Use the _ftol2 runtime function, which has a pseudo-instruction
    // to handle its weird calling convention.
    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }

  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
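  // Expanding FREM here produces calls to the fmod family of libcalls
  // (fmodf/fmod/fmodl) rather than attempting to use x87 FPREM directly.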
  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);

  // Promote the i8 variants and force them up to i32, which has a shorter
  // encoding.
  setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
  AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
  if (Subtarget->hasBMI()) {
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
  }

  if (Subtarget->hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
    AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  if (Subtarget->hasPOPCNT()) {
    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
  } else {
    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT           , MVT::i1   , Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT           , MVT::i8   , Custom);
  setOperationAction(ISD::SELECT           , MVT::i16  , Custom);
  setOperationAction(ISD::SELECT           , MVT::i32  , Custom);
  setOperationAction(ISD::SELECT           , MVT::f32  , Custom);
  setOperationAction(ISD::SELECT           , MVT::f64  , Custom);
  setOperationAction(ISD::SELECT           , MVT::f80  , Custom);
  setOperationAction(ISD::SETCC            , MVT::i8   , Custom);
  setOperationAction(ISD::SETCC            , MVT::i16  , Custom);
  setOperationAction(ISD::SETCC            , MVT::i32  , Custom);
  setOperationAction(ISD::SETCC            , MVT::f32  , Custom);
  setOperationAction(ISD::SETCC            , MVT::f64  , Custom);
  setOperationAction(ISD::SETCC            , MVT::f80  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT         , MVT::i64  , Custom);
    setOperationAction(ISD::SETCC          , MVT::i64  , Custom);
  }
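  // The custom SELECT lowering turns these into X86ISD::CMOV nodes that
  // carry a condition code plus an EFLAGS operand, so a single compare
  // feeding several selects can share one flag-setting instruction.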
  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86).
  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);

  setOperationAction(ISD::MEMBARRIER      , MVT::Other, Custom);
  setOperationAction(ISD::ATOMIC_FENCE    , MVT::Other, Custom);
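  // Roughly: a sequentially consistent fence becomes MFENCE on SSE2-capable
  // targets, while older CPUs fall back to a LOCKed read-modify-write on
  // the stack, which has the same full-barrier effect.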
  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
  setShouldFoldAtomicFences(true);

  // Expand certain atomics.
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
  }

  if (Subtarget->hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);

  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Custom);
  else if (TM.Options.EnableSegmentedStacks)
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Expand);
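  // The COFF case is Custom because Windows requires large stack
  // allocations to be probed page by page (e.g. via the __chkstk helper),
  // so a plain subtraction from ESP/RSP is not sufficient there.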
  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::FR64RegClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f64, Custom);
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f64, Custom);
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // Lower this to FGETSIGNx86 plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN , MVT::f64, Expand);
    setOperationAction(ISD::FCOS , MVT::f64, Expand);
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
    }
  } else if (!TM.Options.UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f32 , Expand);
      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
      setOperationAction(ISD::FCOS , MVT::f32 , Expand);
      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87.
  if (!TM.Options.UseSoftFloat) {
    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f80 , Expand);
      setOperationAction(ISD::FCOS , MVT::f80 , Expand);
    }

    setOperationAction(ISD::FFLOOR,     MVT::f80, Expand);
    setOperationAction(ISD::FCEIL,      MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC,     MVT::f80, Expand);
    setOperationAction(ISD::FRINT,      MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA,        MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);

  setOperationAction(ISD::FLOG,   MVT::f80, Expand);
  setOperationAction(ISD::FLOG2,  MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP,   MVT::f80, Expand);
  setOperationAction(ISD::FEXP2,  MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
       VT <= MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMA,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FFLOOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::TRUNCATE,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ANY_EXTEND,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSELECT,  (MVT::SimpleValueType)VT, Expand);
    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction((MVT::SimpleValueType)VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported; everything uses intrinsics.
  }

  // MMX-sized vectors (other than x86mmx) are expected to be expanded
  // into smaller operations.
  setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
  setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
  setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
  setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
  setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
  setOperationAction(ISD::AND,                MVT::v4i16, Expand);
  setOperationAction(ISD::AND,                MVT::v2i32, Expand);
  setOperationAction(ISD::AND,                MVT::v1i64, Expand);
  setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
  setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
  setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
  setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
  setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
  setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
  setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
  setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
  setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
  setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
  setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
  setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
  setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
  setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
  setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
  setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);

    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
  }
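  // The FNEG/FABS Custom lowerings above (and the v2f64 ones below) exist
  // because SSE has no dedicated negate or absolute-value instructions;
  // they are emitted as XORPS/ANDPS against a sign-bit mask materialized
  // in the constant pool.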
  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);

    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);

    setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to promote non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND,    VT, Promote);
      AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
      setOperationAction(ISD::OR,     VT, Promote);
      AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
      setOperationAction(ISD::XOR,    VT, Promote);
      AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
      setOperationAction(ISD::LOAD,   VT, Promote);
      AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);

    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);

    setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
  }

  if (Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
    setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
    setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
    setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
    setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
    setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
    setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
    setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
    setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
    setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);

    setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
    setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
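    // These are Legal because SSE4.1's ROUNDSS/ROUNDSD/ROUNDPS/ROUNDPD
    // take an immediate that selects the rounding behavior, covering
    // floor, ceil, trunc, rint, and nearbyint directly.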
    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);

    setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
    setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
    setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
    setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);

    // i8 and i16 vectors are custom because the source register and source
    // memory operand types are not the same width.  f32 vectors are
    // custom since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    // FIXME: these should be Legal, but that's only for the case where
    // the index is constant.  For now custom expand to deal with that.
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
  }

  if (Subtarget->hasSSE2()) {
    setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
    setOperationAction(ISD::SRL,               MVT::v16i8, Custom);

    setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
    setOperationAction(ISD::SHL,               MVT::v16i8, Custom);

    setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
    setOperationAction(ISD::SRA,               MVT::v16i8, Custom);

    if (Subtarget->hasAVX2()) {
      setOperationAction(ISD::SRL,             MVT::v2i64, Legal);
      setOperationAction(ISD::SRL,             MVT::v4i32, Legal);

      setOperationAction(ISD::SHL,             MVT::v2i64, Legal);
      setOperationAction(ISD::SHL,             MVT::v4i32, Legal);

      setOperationAction(ISD::SRA,             MVT::v4i32, Legal);
    } else {
      setOperationAction(ISD::SRL,             MVT::v2i64, Custom);
      setOperationAction(ISD::SRL,             MVT::v4i32, Custom);

      setOperationAction(ISD::SHL,             MVT::v2i64, Custom);
      setOperationAction(ISD::SHL,             MVT::v4i32, Custom);

      setOperationAction(ISD::SRA,             MVT::v4i32, Custom);
    }
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);

    setOperationAction(ISD::LOAD,              MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD,              MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD,              MVT::v4i64, Legal);

    setOperationAction(ISD::FADD,              MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB,              MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL,              MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV,              MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT,             MVT::v8f32, Legal);
    setOperationAction(ISD::FFLOOR,            MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG,              MVT::v8f32, Custom);
    setOperationAction(ISD::FABS,              MVT::v8f32, Custom);
    setOperationAction(ISD::FADD,              MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB,              MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL,              MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV,              MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT,             MVT::v4f64, Legal);
    setOperationAction(ISD::FFLOOR,            MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG,              MVT::v4f64, Custom);
    setOperationAction(ISD::FABS,              MVT::v4f64, Custom);

    setOperationAction(ISD::FP_TO_SINT,        MVT::v8i32, Legal);
    setOperationAction(ISD::SINT_TO_FP,        MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND,          MVT::v4f32, Legal);

    setLoadExtAction(ISD::EXTLOAD,             MVT::v4f32, Legal);

    setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
    setOperationAction(ISD::SRL,               MVT::v32i8, Custom);

    setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
    setOperationAction(ISD::SHL,               MVT::v32i8, Custom);

    setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
    setOperationAction(ISD::SRA,               MVT::v32i8, Custom);

    setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
    setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
    setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
    setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);

    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);

    setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
    setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
    setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
    setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);

    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
      setOperationAction(ISD::FMA,             MVT::v8f32, Custom);
      setOperationAction(ISD::FMA,             MVT::v4f64, Custom);
      setOperationAction(ISD::FMA,             MVT::v4f32, Custom);
      setOperationAction(ISD::FMA,             MVT::v2f64, Custom);
      setOperationAction(ISD::FMA,             MVT::f32, Custom);
      setOperationAction(ISD::FMA,             MVT::f64, Custom);
    }

    if (Subtarget->hasAVX2()) {
      setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
      setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
      setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
      setOperationAction(ISD::ADD,             MVT::v32i8, Legal);

      setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
      setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
      setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
      setOperationAction(ISD::SUB,             MVT::v32i8, Legal);

      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
      setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
      setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
      // Don't lower v32i8 because there is no 128-bit byte mul.

      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);

      setOperationAction(ISD::SRL,             MVT::v4i64, Legal);
      setOperationAction(ISD::SRL,             MVT::v8i32, Legal);

      setOperationAction(ISD::SHL,             MVT::v4i64, Legal);
      setOperationAction(ISD::SHL,             MVT::v8i32, Legal);

      setOperationAction(ISD::SRA,             MVT::v8i32, Legal);
    } else {
      setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
      setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
      setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
      setOperationAction(ISD::ADD,             MVT::v32i8, Custom);

      setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
      setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
      setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
      setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
      setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
      setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
      // Don't lower v32i8 because there is no 128-bit byte mul.

      setOperationAction(ISD::SRL,             MVT::v4i64, Custom);
      setOperationAction(ISD::SRL,             MVT::v8i32, Custom);

      setOperationAction(ISD::SHL,             MVT::v4i64, Custom);
      setOperationAction(ISD::SHL,             MVT::v8i32, Custom);

      setOperationAction(ISD::SRA,             MVT::v8i32, Custom);
    }

    // Custom lower several nodes for 256-bit types.
    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
         i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Extract subvector is special because the value type
      // (result) is 128-bit but the source is 256-bit wide.
      if (VT.is128BitVector())
        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

      // Do not attempt to custom lower other non-256-bit vectors.
      if (!VT.is256BitVector())
        continue;

      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
    }

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to promote non-256-bit vectors.
      if (!VT.is256BitVector())
        continue;

      setOperationAction(ISD::AND,    VT, Promote);
      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
      setOperationAction(ISD::OR,     VT, Promote);
      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
      setOperationAction(ISD::XOR,    VT, Promote);
      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
      setOperationAction(ISD::LOAD,   VT, Promote);
      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
    }
  }

  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
  // of this type with custom code.
  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
       VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
                       Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
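  // The overflow ops below map naturally onto EFLAGS: e.g. a UADDO is an
  // ADD whose carry flag is read back with SETB, and SADDO reads the
  // overflow flag with SETO instead of requiring a separate comparison.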
  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
    // Add/Sub/Mul with overflow operations are custom lowered.
    MVT VT = IntVTs[i];
    setOperationAction(ISD::SADDO, VT, Custom);
    setOperationAction(ISD::UADDO, VT, Custom);
    setOperationAction(ISD::SSUBO, VT, Custom);
    setOperationAction(ISD::USUBO, VT, Custom);
    setOperationAction(ISD::SMULO, VT, Custom);
    setOperationAction(ISD::UMULO, VT, Custom);
  }

  // There are no 8-bit 3-address imul/mul instructions.
  setOperationAction(ISD::SMULO, MVT::i8, Expand);
  setOperationAction(ISD::UMULO, MVT::i8, Expand);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::VSELECT);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::XOR);

  computeRegisterProperties();

  // On Darwin, -Os means optimize for size without hurting performance,
  // so do not reduce the limit.
  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  setPrefLoopAlignment(4); // 2^4 bytes.
  benefitFromCodePlacementOpt = true;

  // Predictable cmovs don't hurt on Atom because it's in-order.
  predictableSelectIsExpensive = !Subtarget->isAtom();

  setPrefFunctionAlignment(4); // 2^4 bytes.
}
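// Scalar SETCC results are i8 below because the x86 SETcc instructions
// write an 8-bit register; vector compares instead yield a mask with the
// same element count as the operands.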
1283 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1284 if (MaxAlign == 16)
1285 return;
1286 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1287 if (VTy->getBitWidth() == 128)
1288 MaxAlign = 16;
1289 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1290 unsigned EltAlign = 0;
1291 getMaxByValAlign(ATy->getElementType(), EltAlign);
1292 if (EltAlign > MaxAlign)
1293 MaxAlign = EltAlign;
1294 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1295 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1296 unsigned EltAlign = 0;
1297 getMaxByValAlign(STy->getElementType(i), EltAlign);
1298 if (EltAlign > MaxAlign)
1299 MaxAlign = EltAlign;
1300 if (MaxAlign == 16)
1301 break;
1302 }
1303 }
1304 }
1305
1306 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1307 /// function arguments in the caller parameter area. For X86, aggregates
1308 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1309 /// are at 4-byte boundaries.
1310 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1311 if (Subtarget->is64Bit()) {
1312 // Max of 8 and alignment of type.
1313 unsigned TyAlign = TD->getABITypeAlignment(Ty);
1314 if (TyAlign > 8)
1315 return TyAlign;
1316 return 8;
1317 }
1318
1319 unsigned Align = 4;
1320 if (Subtarget->hasSSE1())
1321 getMaxByValAlign(Ty, Align);
1322 return Align;
1323 }
1324
1325 /// getOptimalMemOpType - Returns the target specific optimal type for load
1326 /// and store operations as a result of memset, memcpy, and memmove
1327 /// lowering. If DstAlign is zero, the destination alignment can satisfy
1328 /// any constraint. Similarly, if SrcAlign is zero, there is no need to
1329 /// check it against an alignment requirement, probably because the source
1330 /// does not need to be loaded. If 'IsZeroVal' is true, it is safe to
1331 /// return a non-scalar-integer type, e.g. for an empty-string source, a
1332 /// constant, or a value loaded from memory. 'MemcpyStrSrc' indicates
1333 /// whether the memcpy source is a constant, so it does not need to be
1334 /// loaded.
1335 /// It returns EVT::Other if the type should be determined using generic
1336 /// target-independent logic.
1337 EVT
1338 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1339 unsigned DstAlign, unsigned SrcAlign,
1340 bool IsZeroVal,
1341 bool MemcpyStrSrc,
1342 MachineFunction &MF) const {
1343 // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
1344 // Linux. This is because the stack realignment code can't handle certain
1345 // cases like PR2962. This should be removed when PR2962 is fixed.
1346 const Function *F = MF.getFunction();
1347 if (IsZeroVal &&
1348 !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
1349 if (Size >= 16 &&
1350 (Subtarget->isUnalignedMemAccessFast() ||
1351 ((DstAlign == 0 || DstAlign >= 16) &&
1352 (SrcAlign == 0 || SrcAlign >= 16))) &&
1353 Subtarget->getStackAlignment() >= 16) {
1354 if (Subtarget->getStackAlignment() >= 32) {
1355 if (Subtarget->hasAVX2())
1356 return MVT::v8i32;
1357 if (Subtarget->hasAVX())
1358 return MVT::v8f32;
1359 }
1360 if (Subtarget->hasSSE2())
1361 return MVT::v4i32;
1362 if (Subtarget->hasSSE1())
1363 return MVT::v4f32;
1364 } else if (!MemcpyStrSrc && Size >= 8 &&
1365 !Subtarget->is64Bit() &&
1366 Subtarget->getStackAlignment() >= 8 &&
1367 Subtarget->hasSSE2()) {
1368 // Do not use f64 to lower memcpy if source is string constant. It's
1369 // better to use i32 to avoid the loads.
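// (Hedged example: for memcpy(dst, "abcdefg", 8) an f64 element would force
// the bytes through a constant-pool FP load, whereas two i32 stores can use
// the string's words directly as immediates.)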
1370 return MVT::f64;
1371 }
1372 }
1373 if (Subtarget->is64Bit() && Size >= 8)
1374 return MVT::i64;
1375 return MVT::i32;
1376 }
1377
1378 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
1379 /// current function. The returned value is a member of the
1380 /// MachineJumpTableInfo::JTEntryKind enum.
1381 unsigned X86TargetLowering::getJumpTableEncoding() const {
1382 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1383 // symbol.
1384 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1385 Subtarget->isPICStyleGOT())
1386 return MachineJumpTableInfo::EK_Custom32;
1387
1388 // Otherwise, use the normal jump table encoding heuristics.
1389 return TargetLowering::getJumpTableEncoding();
1390 }
1391
1392 const MCExpr *
1393 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1394 const MachineBasicBlock *MBB,
1395 unsigned uid, MCContext &Ctx) const {
1396 assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1397 Subtarget->isPICStyleGOT());
1398 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1399 // entries.
1400 return MCSymbolRefExpr::Create(MBB->getSymbol(),
1401 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1402 }
1403
1404 /// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
1405 /// jumptable.
1406 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1407 SelectionDAG &DAG) const {
1408 if (!Subtarget->is64Bit())
1409 // This doesn't have a DebugLoc associated with it, but is not really the
1410 // same as a Register.
1411 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
1412 return Table;
1413 }
1414
1415 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
1416 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
1417 /// MCExpr.
1418 const MCExpr *X86TargetLowering::
1419 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1420 MCContext &Ctx) const {
1421 // X86-64 uses RIP-relative addressing based on the jump table label.
1422 if (Subtarget->isPICStyleRIPRel())
1423 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1424
1425 // Otherwise, the reference is relative to the PIC base.
1426 return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1427 }
1428
1429 // FIXME: Why is this routine here? Move it to RegInfo!
1430 std::pair<const TargetRegisterClass*, uint8_t>
1431 X86TargetLowering::findRepresentativeClass(EVT VT) const {
1432 const TargetRegisterClass *RRC = 0;
1433 uint8_t Cost = 1;
1434 switch (VT.getSimpleVT().SimpleTy) {
1435 default:
1436 return TargetLowering::findRepresentativeClass(VT);
1437 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1438 RRC = Subtarget->is64Bit() ?
1439 (const TargetRegisterClass*)&X86::GR64RegClass : 1440 (const TargetRegisterClass*)&X86::GR32RegClass; 1441 break; 1442 case MVT::x86mmx: 1443 RRC = &X86::VR64RegClass; 1444 break; 1445 case MVT::f32: case MVT::f64: 1446 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1447 case MVT::v4f32: case MVT::v2f64: 1448 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1449 case MVT::v4f64: 1450 RRC = &X86::VR128RegClass; 1451 break; 1452 } 1453 return std::make_pair(RRC, Cost); 1454} 1455 1456bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1457 unsigned &Offset) const { 1458 if (!Subtarget->isTargetLinux()) 1459 return false; 1460 1461 if (Subtarget->is64Bit()) { 1462 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1463 Offset = 0x28; 1464 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1465 AddressSpace = 256; 1466 else 1467 AddressSpace = 257; 1468 } else { 1469 // %gs:0x14 on i386 1470 Offset = 0x14; 1471 AddressSpace = 256; 1472 } 1473 return true; 1474} 1475 1476 1477//===----------------------------------------------------------------------===// 1478// Return Value Calling Convention Implementation 1479//===----------------------------------------------------------------------===// 1480 1481#include "X86GenCallingConv.inc" 1482 1483bool 1484X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, 1485 MachineFunction &MF, bool isVarArg, 1486 const SmallVectorImpl<ISD::OutputArg> &Outs, 1487 LLVMContext &Context) const { 1488 SmallVector<CCValAssign, 16> RVLocs; 1489 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1490 RVLocs, Context); 1491 return CCInfo.CheckReturn(Outs, RetCC_X86); 1492} 1493 1494SDValue 1495X86TargetLowering::LowerReturn(SDValue Chain, 1496 CallingConv::ID CallConv, bool isVarArg, 1497 const SmallVectorImpl<ISD::OutputArg> &Outs, 1498 const SmallVectorImpl<SDValue> &OutVals, 1499 DebugLoc dl, SelectionDAG &DAG) const { 1500 MachineFunction &MF = DAG.getMachineFunction(); 1501 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1502 1503 SmallVector<CCValAssign, 16> RVLocs; 1504 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1505 RVLocs, *DAG.getContext()); 1506 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1507 1508 // Add the regs to the liveout set for the function. 1509 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1510 for (unsigned i = 0; i != RVLocs.size(); ++i) 1511 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1512 MRI.addLiveOut(RVLocs[i].getLocReg()); 1513 1514 SDValue Flag; 1515 1516 SmallVector<SDValue, 6> RetOps; 1517 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1518 // Operand #1 = Bytes To Pop 1519 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1520 MVT::i16)); 1521 1522 // Copy the result values into the output registers. 
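// Illustrative shape of the output (assumed, simple i32 case): for
//   define i32 @f(...) { ... ret i32 %x }
// RetCC_X86 assigns %x to EAX, so the loop below emits a glued
// CopyToReg(EAX, %x) that feeds the X86ISD::RET_FLAG node built at the end.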
1523 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1524 CCValAssign &VA = RVLocs[i]; 1525 assert(VA.isRegLoc() && "Can only return in registers!"); 1526 SDValue ValToCopy = OutVals[i]; 1527 EVT ValVT = ValToCopy.getValueType(); 1528 1529 // Promote values to the appropriate types 1530 if (VA.getLocInfo() == CCValAssign::SExt) 1531 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); 1532 else if (VA.getLocInfo() == CCValAssign::ZExt) 1533 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); 1534 else if (VA.getLocInfo() == CCValAssign::AExt) 1535 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); 1536 else if (VA.getLocInfo() == CCValAssign::BCvt) 1537 ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); 1538 1539 // If this is x86-64, and we disabled SSE, we can't return FP values, 1540 // or SSE or MMX vectors. 1541 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 1542 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 1543 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { 1544 report_fatal_error("SSE register return with SSE disabled"); 1545 } 1546 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 1547 // llvm-gcc has never done it right and no one has noticed, so this 1548 // should be OK for now. 1549 if (ValVT == MVT::f64 && 1550 (Subtarget->is64Bit() && !Subtarget->hasSSE2())) 1551 report_fatal_error("SSE2 register return with SSE2 disabled"); 1552 1553 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1554 // the RET instruction and handled by the FP Stackifier. 1555 if (VA.getLocReg() == X86::ST0 || 1556 VA.getLocReg() == X86::ST1) { 1557 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1558 // change the value to the FP stack register class. 1559 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1560 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1561 RetOps.push_back(ValToCopy); 1562 // Don't emit a copytoreg. 1563 continue; 1564 } 1565 1566 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1567 // which is returned in RAX / RDX. 1568 if (Subtarget->is64Bit()) { 1569 if (ValVT == MVT::x86mmx) { 1570 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1571 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 1572 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1573 ValToCopy); 1574 // If we don't have SSE2 available, convert to v4f32 so the generated 1575 // register is legal. 1576 if (!Subtarget->hasSSE2()) 1577 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 1578 } 1579 } 1580 } 1581 1582 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1583 Flag = Chain.getValue(1); 1584 } 1585 1586 // The x86-64 ABI for returning structs by value requires that we copy 1587 // the sret argument into %rax for the return. We saved the argument into 1588 // a virtual register in the entry block, so now we copy the value out 1589 // and into %rax. 
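// Illustrative IR for that situation (assumed shape):
//   define void @f(%struct.S* sret %agg.result, ...)
// The x86-64 ABI requires %rax to contain the sret pointer when the function
// returns, which is what the copies below arrange.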
1590 if (Subtarget->is64Bit() && 1591 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1592 MachineFunction &MF = DAG.getMachineFunction(); 1593 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1594 unsigned Reg = FuncInfo->getSRetReturnReg(); 1595 assert(Reg && 1596 "SRetReturnReg should have been set in LowerFormalArguments()."); 1597 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1598 1599 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1600 Flag = Chain.getValue(1); 1601 1602 // RAX now acts like a return value. 1603 MRI.addLiveOut(X86::RAX); 1604 } 1605 1606 RetOps[0] = Chain; // Update chain. 1607 1608 // Add the flag if we have it. 1609 if (Flag.getNode()) 1610 RetOps.push_back(Flag); 1611 1612 return DAG.getNode(X86ISD::RET_FLAG, dl, 1613 MVT::Other, &RetOps[0], RetOps.size()); 1614} 1615 1616bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 1617 if (N->getNumValues() != 1) 1618 return false; 1619 if (!N->hasNUsesOfValue(1, 0)) 1620 return false; 1621 1622 SDValue TCChain = Chain; 1623 SDNode *Copy = *N->use_begin(); 1624 if (Copy->getOpcode() == ISD::CopyToReg) { 1625 // If the copy has a glue operand, we conservatively assume it isn't safe to 1626 // perform a tail call. 1627 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 1628 return false; 1629 TCChain = Copy->getOperand(0); 1630 } else if (Copy->getOpcode() != ISD::FP_EXTEND) 1631 return false; 1632 1633 bool HasRet = false; 1634 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 1635 UI != UE; ++UI) { 1636 if (UI->getOpcode() != X86ISD::RET_FLAG) 1637 return false; 1638 HasRet = true; 1639 } 1640 1641 if (!HasRet) 1642 return false; 1643 1644 Chain = TCChain; 1645 return true; 1646} 1647 1648EVT 1649X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, 1650 ISD::NodeType ExtendKind) const { 1651 MVT ReturnMVT; 1652 // TODO: Is this also valid on 32-bit? 1653 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) 1654 ReturnMVT = MVT::i8; 1655 else 1656 ReturnMVT = MVT::i32; 1657 1658 EVT MinVT = getRegisterType(Context, ReturnMVT); 1659 return VT.bitsLT(MinVT) ? MinVT : VT; 1660} 1661 1662/// LowerCallResult - Lower the result values of a call into the 1663/// appropriate copies out of appropriate physical registers. 1664/// 1665SDValue 1666X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1667 CallingConv::ID CallConv, bool isVarArg, 1668 const SmallVectorImpl<ISD::InputArg> &Ins, 1669 DebugLoc dl, SelectionDAG &DAG, 1670 SmallVectorImpl<SDValue> &InVals) const { 1671 1672 // Assign locations to each value returned by this call. 1673 SmallVector<CCValAssign, 16> RVLocs; 1674 bool Is64Bit = Subtarget->is64Bit(); 1675 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 1676 getTargetMachine(), RVLocs, *DAG.getContext()); 1677 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1678 1679 // Copy all of the result registers out of their specified physreg. 
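// e.g. (illustrative) a callee returning { i64, i64 } on x86-64 hands the
// two halves back in RAX and RDX; the loop below then produces one
// CopyFromReg per location, threaded through the glue value so the copies
// stay attached to the call node.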
1680 for (unsigned i = 0; i != RVLocs.size(); ++i) {
1681 CCValAssign &VA = RVLocs[i];
1682 EVT CopyVT = VA.getValVT();
1683
1684 // If this is x86-64, and we disabled SSE, we can't return FP values
1685 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1686 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1687 report_fatal_error("SSE register return with SSE disabled");
1688 }
1689
1690 SDValue Val;
1691
1692 // If this is a call to a function that returns an fp value on the floating
1693 // point stack, we must guarantee the value is popped from the stack, so
1694 // a CopyFromReg is not good enough - the copy instruction may be eliminated
1695 // if the return value is not used. We use the FpPOP_RETVAL instruction
1696 // instead.
1697 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
1698 // If we prefer to use the value in xmm registers, copy it out as f80 and
1699 // use a truncate to move it from fp stack reg to xmm reg.
1700 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
1701 SDValue Ops[] = { Chain, InFlag };
1702 Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
1703 MVT::Other, MVT::Glue, Ops, 2), 1);
1704 Val = Chain.getValue(0);
1705
1706 // Round the f80 to the right size, which also moves it to the appropriate
1707 // xmm register.
1708 if (CopyVT != VA.getValVT())
1709 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1710 // This truncation won't change the value.
1711 DAG.getIntPtrConstant(1));
1712 } else {
1713 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1714 CopyVT, InFlag).getValue(1);
1715 Val = Chain.getValue(0);
1716 }
1717 InFlag = Chain.getValue(2);
1718 InVals.push_back(Val);
1719 }
1720
1721 return Chain;
1722 }
1723
1724
1725 //===----------------------------------------------------------------------===//
1726 // C & StdCall & Fast Calling Convention implementation
1727 //===----------------------------------------------------------------------===//
1728 // The StdCall calling convention is the standard for many Windows API
1729 // routines. It differs from the C calling convention only slightly: the
1730 // callee, not the caller, cleans up the stack, and symbols are decorated.
1731 // It doesn't support any vector arguments.
1732 // For info on the fast calling convention, see the Fast Calling Convention
1733 // (tail call) implementation in LowerX86_32FastCCCallTo.
1734
1735 /// callIsStructReturn - Determines whether a call uses struct return
1736 /// semantics.
1737 enum StructReturnType {
1738 NotStructReturn,
1739 RegStructReturn,
1740 StackStructReturn
1741 };
1742 static StructReturnType
1743 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1744 if (Outs.empty())
1745 return NotStructReturn;
1746
1747 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
1748 if (!Flags.isSRet())
1749 return NotStructReturn;
1750 if (Flags.isInReg())
1751 return RegStructReturn;
1752 return StackStructReturn;
1753 }
1754
1755 /// argsAreStructReturn - Determines whether a function uses struct
1756 /// return semantics.
1757 static StructReturnType
1758 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1759 if (Ins.empty())
1760 return NotStructReturn;
1761
1762 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
1763 if (!Flags.isSRet())
1764 return NotStructReturn;
1765 if (Flags.isInReg())
1766 return RegStructReturn;
1767 return StackStructReturn;
1768 }
1769
1770 /// CreateCopyOfByValArgument - Make a copy of the aggregate at the address
1771 /// specified by "Src" to the address "Dst", with the size and alignment
1772 /// information specified by the parameter attribute. The copy will be
1773 /// passed as a byval function parameter.
1774 static SDValue
1775 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1776 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1777 DebugLoc dl) {
1778 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1779
1780 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1781 /*isVolatile*/false, /*AlwaysInline=*/true,
1782 MachinePointerInfo(), MachinePointerInfo());
1783 }
1784
1785 /// IsTailCallConvention - Return true if the calling convention is one that
1786 /// supports tail call optimization.
1787 static bool IsTailCallConvention(CallingConv::ID CC) {
1788 return (CC == CallingConv::Fast || CC == CallingConv::GHC);
1789 }
1790
1791 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
1792 if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
1793 return false;
1794
1795 CallSite CS(CI);
1796 CallingConv::ID CalleeCC = CS.getCallingConv();
1797 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
1798 return false;
1799
1800 return true;
1801 }
1802
1803 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
1804 /// a tailcall target by changing its ABI.
1805 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
1806 bool GuaranteedTailCallOpt) {
1807 return GuaranteedTailCallOpt && IsTailCallConvention(CC);
1808 }
1809
1810 SDValue
1811 X86TargetLowering::LowerMemArgument(SDValue Chain,
1812 CallingConv::ID CallConv,
1813 const SmallVectorImpl<ISD::InputArg> &Ins,
1814 DebugLoc dl, SelectionDAG &DAG,
1815 const CCValAssign &VA,
1816 MachineFrameInfo *MFI,
1817 unsigned i) const {
1818 // Create the nodes corresponding to a load from this parameter slot.
1819 ISD::ArgFlagsTy Flags = Ins[i].Flags;
1820 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
1821 getTargetMachine().Options.GuaranteedTailCallOpt);
1822 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1823 EVT ValVT;
1824
1825 // If value is passed by pointer, we have the address passed instead of the
1826 // value itself.
1827 if (VA.getLocInfo() == CCValAssign::Indirect)
1828 ValVT = VA.getLocVT();
1829 else
1830 ValVT = VA.getValVT();
1831
1832 // FIXME: For now, all byval parameter objects are marked mutable. This can
1833 // be changed with more analysis.
1834 // In case of tail call optimization, mark all arguments mutable, since they
1835 // could be overwritten by the lowering of arguments in case of a tail call.
1836 if (Flags.isByVal()) {
1837 unsigned Bytes = Flags.getByValSize();
1838 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
1839 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); 1840 return DAG.getFrameIndex(FI, getPointerTy()); 1841 } else { 1842 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1843 VA.getLocMemOffset(), isImmutable); 1844 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1845 return DAG.getLoad(ValVT, dl, Chain, FIN, 1846 MachinePointerInfo::getFixedStack(FI), 1847 false, false, false, 0); 1848 } 1849} 1850 1851SDValue 1852X86TargetLowering::LowerFormalArguments(SDValue Chain, 1853 CallingConv::ID CallConv, 1854 bool isVarArg, 1855 const SmallVectorImpl<ISD::InputArg> &Ins, 1856 DebugLoc dl, 1857 SelectionDAG &DAG, 1858 SmallVectorImpl<SDValue> &InVals) 1859 const { 1860 MachineFunction &MF = DAG.getMachineFunction(); 1861 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1862 1863 const Function* Fn = MF.getFunction(); 1864 if (Fn->hasExternalLinkage() && 1865 Subtarget->isTargetCygMing() && 1866 Fn->getName() == "main") 1867 FuncInfo->setForceFramePointer(true); 1868 1869 MachineFrameInfo *MFI = MF.getFrameInfo(); 1870 bool Is64Bit = Subtarget->is64Bit(); 1871 bool IsWindows = Subtarget->isTargetWindows(); 1872 bool IsWin64 = Subtarget->isTargetWin64(); 1873 1874 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1875 "Var args not supported with calling convention fastcc or ghc"); 1876 1877 // Assign locations to all of the incoming arguments. 1878 SmallVector<CCValAssign, 16> ArgLocs; 1879 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1880 ArgLocs, *DAG.getContext()); 1881 1882 // Allocate shadow area for Win64 1883 if (IsWin64) { 1884 CCInfo.AllocateStack(32, 8); 1885 } 1886 1887 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 1888 1889 unsigned LastVal = ~0U; 1890 SDValue ArgValue; 1891 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1892 CCValAssign &VA = ArgLocs[i]; 1893 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1894 // places. 1895 assert(VA.getValNo() != LastVal && 1896 "Don't support value assigned to multiple locs yet"); 1897 (void)LastVal; 1898 LastVal = VA.getValNo(); 1899 1900 if (VA.isRegLoc()) { 1901 EVT RegVT = VA.getLocVT(); 1902 const TargetRegisterClass *RC; 1903 if (RegVT == MVT::i32) 1904 RC = &X86::GR32RegClass; 1905 else if (Is64Bit && RegVT == MVT::i64) 1906 RC = &X86::GR64RegClass; 1907 else if (RegVT == MVT::f32) 1908 RC = &X86::FR32RegClass; 1909 else if (RegVT == MVT::f64) 1910 RC = &X86::FR64RegClass; 1911 else if (RegVT.is256BitVector()) 1912 RC = &X86::VR256RegClass; 1913 else if (RegVT.is128BitVector()) 1914 RC = &X86::VR128RegClass; 1915 else if (RegVT == MVT::x86mmx) 1916 RC = &X86::VR64RegClass; 1917 else 1918 llvm_unreachable("Unknown argument type!"); 1919 1920 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1921 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1922 1923 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1924 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1925 // right size. 
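// Illustrative (assumed i8 case): an i8 argument arrives in the low byte of
// a 32-bit register, so we build
//   (truncate (AssertZext/AssertSext reg, i8))
// letting later nodes rely on the upper 24 bits being zero/sign material.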
1926 if (VA.getLocInfo() == CCValAssign::SExt) 1927 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1928 DAG.getValueType(VA.getValVT())); 1929 else if (VA.getLocInfo() == CCValAssign::ZExt) 1930 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1931 DAG.getValueType(VA.getValVT())); 1932 else if (VA.getLocInfo() == CCValAssign::BCvt) 1933 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 1934 1935 if (VA.isExtInLoc()) { 1936 // Handle MMX values passed in XMM regs. 1937 if (RegVT.isVector()) { 1938 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), 1939 ArgValue); 1940 } else 1941 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1942 } 1943 } else { 1944 assert(VA.isMemLoc()); 1945 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1946 } 1947 1948 // If value is passed via pointer - do a load. 1949 if (VA.getLocInfo() == CCValAssign::Indirect) 1950 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 1951 MachinePointerInfo(), false, false, false, 0); 1952 1953 InVals.push_back(ArgValue); 1954 } 1955 1956 // The x86-64 ABI for returning structs by value requires that we copy 1957 // the sret argument into %rax for the return. Save the argument into 1958 // a virtual register so that we can access it from the return points. 1959 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1960 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1961 unsigned Reg = FuncInfo->getSRetReturnReg(); 1962 if (!Reg) { 1963 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1964 FuncInfo->setSRetReturnReg(Reg); 1965 } 1966 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1967 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1968 } 1969 1970 unsigned StackSize = CCInfo.getNextStackOffset(); 1971 // Align stack specially for tail calls. 1972 if (FuncIsMadeTailCallSafe(CallConv, 1973 MF.getTarget().Options.GuaranteedTailCallOpt)) 1974 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1975 1976 // If the function takes variable number of arguments, make a frame index for 1977 // the start of the first vararg value... for expansion of llvm.va_start. 1978 if (isVarArg) { 1979 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1980 CallConv != CallingConv::X86_ThisCall)) { 1981 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1982 } 1983 if (Is64Bit) { 1984 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1985 1986 // FIXME: We should really autogenerate these arrays 1987 static const uint16_t GPR64ArgRegsWin64[] = { 1988 X86::RCX, X86::RDX, X86::R8, X86::R9 1989 }; 1990 static const uint16_t GPR64ArgRegs64Bit[] = { 1991 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1992 }; 1993 static const uint16_t XMMArgRegs64Bit[] = { 1994 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1995 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1996 }; 1997 const uint16_t *GPR64ArgRegs; 1998 unsigned NumXMMRegs = 0; 1999 2000 if (IsWin64) { 2001 // The XMM registers which might contain var arg parameters are shadowed 2002 // in their paired GPR. So we only need to save the GPR to their home 2003 // slots. 
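// Illustrative Win64 entry frame (assumed layout): the return address sits
// at [rsp] and the four home slots for RCX, RDX, R8 and R9 occupy [rsp+8]
// through [rsp+32], which is why HomeOffset below adds 8 to the local area
// offset.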
2004 TotalNumIntRegs = 4;
2005 GPR64ArgRegs = GPR64ArgRegsWin64;
2006 } else {
2007 TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
2008 GPR64ArgRegs = GPR64ArgRegs64Bit;
2009
2010 NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
2011 TotalNumXMMRegs);
2012 }
2013 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
2014 TotalNumIntRegs);
2015
2016 bool NoImplicitFloatOps = Fn->getFnAttributes().
2017 hasAttribute(Attributes::NoImplicitFloat);
2018 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2019 "SSE register cannot be used when SSE is disabled!");
2020 assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
2021 NoImplicitFloatOps) &&
2022 "SSE register cannot be used when SSE is disabled!");
2023 if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2024 !Subtarget->hasSSE1())
2025 // Kernel mode asks for SSE to be disabled, so don't push them
2026 // on the stack.
2027 TotalNumXMMRegs = 0;
2028
2029 if (IsWin64) {
2030 const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
2031 // Get to the caller-allocated home save location. Add 8 to account
2032 // for the return address.
2033 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2034 FuncInfo->setRegSaveFrameIndex(
2035 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2036 // Fixup to set vararg frame on shadow area (4 x i64).
2037 if (NumIntRegs < 4)
2038 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2039 } else {
2040 // For X86-64, if there are vararg parameters that are passed via
2041 // registers, then we must store them to their spots on the stack so
2042 // they may be loaded by dereferencing the result of va_arg.
2043 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2044 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
2045 FuncInfo->setRegSaveFrameIndex(
2046 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
2047 false));
2048 }
2049
2050 // Store the integer parameter registers.
2051 SmallVector<SDValue, 8> MemOps;
2052 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2053 getPointerTy());
2054 unsigned Offset = FuncInfo->getVarArgsGPOffset();
2055 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
2056 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2057 DAG.getIntPtrConstant(Offset));
2058 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
2059 &X86::GR64RegClass);
2060 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
2061 SDValue Store =
2062 DAG.getStore(Val.getValue(1), dl, Val, FIN,
2063 MachinePointerInfo::getFixedStack(
2064 FuncInfo->getRegSaveFrameIndex(), Offset),
2065 false, false, 0);
2066 MemOps.push_back(Store);
2067 Offset += 8;
2068 }
2069
2070 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
2071 // Now store the XMM (fp + vector) parameter registers.
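// Sketch of what the pseudo expands to later (assumed):
//   testb %al, %al
//   je    <skip>
//   movaps %xmm0, <slot0> ; ... one store per unallocated XMM register
// so the stores only execute when %al reports that SSE registers were used.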
2072 SmallVector<SDValue, 11> SaveXMMOps; 2073 SaveXMMOps.push_back(Chain); 2074 2075 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); 2076 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 2077 SaveXMMOps.push_back(ALVal); 2078 2079 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2080 FuncInfo->getRegSaveFrameIndex())); 2081 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2082 FuncInfo->getVarArgsFPOffset())); 2083 2084 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 2085 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 2086 &X86::VR128RegClass); 2087 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 2088 SaveXMMOps.push_back(Val); 2089 } 2090 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 2091 MVT::Other, 2092 &SaveXMMOps[0], SaveXMMOps.size())); 2093 } 2094 2095 if (!MemOps.empty()) 2096 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2097 &MemOps[0], MemOps.size()); 2098 } 2099 } 2100 2101 // Some CCs need callee pop. 2102 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 2103 MF.getTarget().Options.GuaranteedTailCallOpt)) { 2104 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 2105 } else { 2106 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 2107 // If this is an sret function, the return should pop the hidden pointer. 2108 if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && 2109 argsAreStructReturn(Ins) == StackStructReturn) 2110 FuncInfo->setBytesToPopOnReturn(4); 2111 } 2112 2113 if (!Is64Bit) { 2114 // RegSaveFrameIndex is X86-64 only. 2115 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 2116 if (CallConv == CallingConv::X86_FastCall || 2117 CallConv == CallingConv::X86_ThisCall) 2118 // fastcc functions can't have varargs. 2119 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 2120 } 2121 2122 FuncInfo->setArgumentStackSize(StackSize); 2123 2124 return Chain; 2125} 2126 2127SDValue 2128X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 2129 SDValue StackPtr, SDValue Arg, 2130 DebugLoc dl, SelectionDAG &DAG, 2131 const CCValAssign &VA, 2132 ISD::ArgFlagsTy Flags) const { 2133 unsigned LocMemOffset = VA.getLocMemOffset(); 2134 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 2135 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 2136 if (Flags.isByVal()) 2137 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 2138 2139 return DAG.getStore(Chain, dl, Arg, PtrOff, 2140 MachinePointerInfo::getStack(LocMemOffset), 2141 false, false, 0); 2142} 2143 2144/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 2145/// optimization is performed and it is required. 2146SDValue 2147X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 2148 SDValue &OutRetAddr, SDValue Chain, 2149 bool IsTailCall, bool Is64Bit, 2150 int FPDiff, DebugLoc dl) const { 2151 // Adjust the Return address stack slot. 2152 EVT VT = getPointerTy(); 2153 OutRetAddr = getReturnAddressFrameIndex(DAG); 2154 2155 // Load the "old" Return address. 2156 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 2157 false, false, false, 0); 2158 return SDValue(OutRetAddr.getNode(), 1); 2159} 2160 2161/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call 2162/// optimization is performed and it is required (FPDiff!=0). 
2163 static SDValue
2164 EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
2165 SDValue Chain, SDValue RetAddrFrIdx,
2166 bool Is64Bit, int FPDiff, DebugLoc dl) {
2167 // Store the return address to the appropriate stack slot.
2168 if (!FPDiff) return Chain;
2169 // Calculate the new stack slot for the return address.
2170 int SlotSize = Is64Bit ? 8 : 4;
2171 int NewReturnAddrFI =
2172 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
2173 EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
2174 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
2175 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2176 MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2177 false, false, 0);
2178 return Chain;
2179 }
2180
2181 SDValue
2182 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2183 SmallVectorImpl<SDValue> &InVals) const {
2184 SelectionDAG &DAG = CLI.DAG;
2185 DebugLoc &dl = CLI.DL;
2186 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2187 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2188 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2189 SDValue Chain = CLI.Chain;
2190 SDValue Callee = CLI.Callee;
2191 CallingConv::ID CallConv = CLI.CallConv;
2192 bool &isTailCall = CLI.IsTailCall;
2193 bool isVarArg = CLI.IsVarArg;
2194
2195 MachineFunction &MF = DAG.getMachineFunction();
2196 bool Is64Bit = Subtarget->is64Bit();
2197 bool IsWin64 = Subtarget->isTargetWin64();
2198 bool IsWindows = Subtarget->isTargetWindows();
2199 StructReturnType SR = callIsStructReturn(Outs);
2200 bool IsSibcall = false;
2201
2202 if (MF.getTarget().Options.DisableTailCalls)
2203 isTailCall = false;
2204
2205 if (isTailCall) {
2206 // Check if it's really possible to do a tail call.
2207 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2208 isVarArg, SR != NotStructReturn,
2209 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2210 Outs, OutVals, Ins, DAG);
2211
2212 // Sibcalls are automatically detected tailcalls which do not require
2213 // ABI changes.
2214 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2215 IsSibcall = true;
2216
2217 if (isTailCall)
2218 ++NumTailCalls;
2219 }
2220
2221 assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2222 "Var args not supported with calling convention fastcc or ghc");
2223
2224 // Analyze operands of the call, assigning locations to each operand.
2225 SmallVector<CCValAssign, 16> ArgLocs;
2226 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
2227 ArgLocs, *DAG.getContext());
2228
2229 // Allocate shadow area for Win64
2230 if (IsWin64) {
2231 CCInfo.AllocateStack(32, 8);
2232 }
2233
2234 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2235
2236 // Get a count of how many bytes are to be pushed on the stack.
2237 unsigned NumBytes = CCInfo.getNextStackOffset();
2238 if (IsSibcall)
2239 // This is a sibcall. The memory operands are already available in the
2240 // caller's incoming argument area (its own caller's stack).
2241 NumBytes = 0;
2242 else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
2243 IsTailCallConvention(CallConv))
2244 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2245
2246 int FPDiff = 0;
2247 if (isTailCall && !IsSibcall) {
2248 // Lower arguments at fp - stackoffset + fpdiff.
2249 unsigned NumBytesCallerPushed =
2250 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
2251 FPDiff = NumBytesCallerPushed - NumBytes;
2252
2253 // Record the delta by which the return-address slot moves, but only if
2254 // this call moves it further than any previously recorded delta.
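// Worked example (illustrative): if this caller pops 16 bytes of its own
// incoming arguments but the tail call needs NumBytes == 32, then
//   FPDiff = 16 - 32 = -16
// and the return address must be rewritten 16 bytes further down, which
// EmitTailCallStoreRetAddr does through a fixed object at FPDiff-SlotSize.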
2255 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 2256 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 2257 } 2258 2259 if (!IsSibcall) 2260 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 2261 2262 SDValue RetAddrFrIdx; 2263 // Load return address for tail calls. 2264 if (isTailCall && FPDiff) 2265 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 2266 Is64Bit, FPDiff, dl); 2267 2268 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 2269 SmallVector<SDValue, 8> MemOpChains; 2270 SDValue StackPtr; 2271 2272 // Walk the register/memloc assignments, inserting copies/loads. In the case 2273 // of tail call optimization arguments are handle later. 2274 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2275 CCValAssign &VA = ArgLocs[i]; 2276 EVT RegVT = VA.getLocVT(); 2277 SDValue Arg = OutVals[i]; 2278 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2279 bool isByVal = Flags.isByVal(); 2280 2281 // Promote the value if needed. 2282 switch (VA.getLocInfo()) { 2283 default: llvm_unreachable("Unknown loc info!"); 2284 case CCValAssign::Full: break; 2285 case CCValAssign::SExt: 2286 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 2287 break; 2288 case CCValAssign::ZExt: 2289 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 2290 break; 2291 case CCValAssign::AExt: 2292 if (RegVT.is128BitVector()) { 2293 // Special case: passing MMX values in XMM registers. 2294 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); 2295 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 2296 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 2297 } else 2298 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 2299 break; 2300 case CCValAssign::BCvt: 2301 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg); 2302 break; 2303 case CCValAssign::Indirect: { 2304 // Store the argument. 2305 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 2306 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 2307 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 2308 MachinePointerInfo::getFixedStack(FI), 2309 false, false, 0); 2310 Arg = SpillSlot; 2311 break; 2312 } 2313 } 2314 2315 if (VA.isRegLoc()) { 2316 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2317 if (isVarArg && IsWin64) { 2318 // Win64 ABI requires argument XMM reg to be copied to the corresponding 2319 // shadow reg if callee is a varargs function. 2320 unsigned ShadowReg = 0; 2321 switch (VA.getLocReg()) { 2322 case X86::XMM0: ShadowReg = X86::RCX; break; 2323 case X86::XMM1: ShadowReg = X86::RDX; break; 2324 case X86::XMM2: ShadowReg = X86::R8; break; 2325 case X86::XMM3: ShadowReg = X86::R9; break; 2326 } 2327 if (ShadowReg) 2328 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 2329 } 2330 } else if (!IsSibcall && (!isTailCall || isByVal)) { 2331 assert(VA.isMemLoc()); 2332 if (StackPtr.getNode() == 0) 2333 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 2334 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2335 dl, DAG, VA, Flags)); 2336 } 2337 } 2338 2339 if (!MemOpChains.empty()) 2340 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2341 &MemOpChains[0], MemOpChains.size()); 2342 2343 if (Subtarget->isPICStyleGOT()) { 2344 // ELF / PIC requires GOT in the EBX register before function calls via PLT 2345 // GOT pointer. 
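// Illustrative i386 ELF detail (assumed): a PLT stub resolves through the
// GOT with something like
//   jmp *func@GOT(%ebx)
// so %ebx must hold the GOT address at the call site; that is why EBX is
// added to RegsToPass for the non-tail-call case below.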
2346 if (!isTailCall) {
2347 RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2348 DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy())));
2349 } else {
2350 // If we are tail calling and generating PIC/GOT style code, load the
2351 // address of the callee into ECX. The value in ECX is used as the target
2352 // of the tail jump. This is done to circumvent the ebx/callee-saved
2353 // problem for tail calls on PIC/GOT architectures. Normally we would just
2354 // put the address of the GOT into ebx and then call target@PLT. But for
2355 // tail calls ebx would be restored (since ebx is callee saved) before
2356 // jumping to the target@PLT.
2357
2358 // Note: The actual moving to ECX is done further down.
2359 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2360 if (G && !G->getGlobal()->hasHiddenVisibility() &&
2361 !G->getGlobal()->hasProtectedVisibility())
2362 Callee = LowerGlobalAddress(Callee, DAG);
2363 else if (isa<ExternalSymbolSDNode>(Callee))
2364 Callee = LowerExternalSymbol(Callee, DAG);
2365 }
2366 }
2367
2368 if (Is64Bit && isVarArg && !IsWin64) {
2369 // From the AMD64 ABI document:
2370 // For calls that may call functions that use varargs or stdargs
2371 // (prototype-less calls or calls to functions containing ellipsis (...) in
2372 // the declaration) %al is used as a hidden argument to specify the number
2373 // of SSE registers used. The contents of %al do not need to match exactly
2374 // the number of registers, but must be an upper bound on the number of SSE
2375 // registers used and is in the range 0 - 8 inclusive.
2376
2377 // Count the number of XMM registers allocated.
2378 static const uint16_t XMMArgRegs[] = {
2379 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2380 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2381 };
2382 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2383 assert((Subtarget->hasSSE1() || !NumXMMRegs)
2384 && "SSE registers cannot be used when SSE is disabled");
2385
2386 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
2387 DAG.getConstant(NumXMMRegs, MVT::i8)));
2388 }
2389
2390 // For tail calls lower the arguments to the 'real' stack slot.
2391 if (isTailCall) {
2392 // Force all the incoming stack arguments to be loaded from the stack
2393 // before any new outgoing arguments are stored to the stack, because the
2394 // outgoing stack slots may alias the incoming argument stack slots, and
2395 // the alias isn't otherwise explicit. This is slightly more conservative
2396 // than necessary, because it means that each store effectively depends
2397 // on every argument instead of just those arguments it would clobber.
2398 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2399
2400 SmallVector<SDValue, 8> MemOpChains2;
2401 SDValue FIN;
2402 int FI = 0;
2403 if (getTargetMachine().Options.GuaranteedTailCallOpt) {
2404 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2405 CCValAssign &VA = ArgLocs[i];
2406 if (VA.isRegLoc())
2407 continue;
2408 assert(VA.isMemLoc());
2409 SDValue Arg = OutVals[i];
2410 ISD::ArgFlagsTy Flags = Outs[i].Flags;
2411 // Create frame index.
2412 int32_t Offset = VA.getLocMemOffset()+FPDiff;
2413 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2414 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2415 FIN = DAG.getFrameIndex(FI, getPointerTy());
2416
2417 if (Flags.isByVal()) {
2418 // Copy relative to framepointer.
2419 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2420 if (StackPtr.getNode() == 0)
2421 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
2422 getPointerTy());
2423 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2424
2425 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2426 ArgChain,
2427 Flags, DAG, dl));
2428 } else {
2429 // Store relative to framepointer.
2430 MemOpChains2.push_back(
2431 DAG.getStore(ArgChain, dl, Arg, FIN,
2432 MachinePointerInfo::getFixedStack(FI),
2433 false, false, 0));
2434 }
2435 }
2436 }
2437
2438 if (!MemOpChains2.empty())
2439 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2440 &MemOpChains2[0], MemOpChains2.size());
2441
2442 // Store the return address to the appropriate stack slot.
2443 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
2444 FPDiff, dl);
2445 }
2446
2447 // Build a sequence of copy-to-reg nodes chained together with token chain
2448 // and flag operands which copy the outgoing args into registers.
2449 SDValue InFlag;
2450 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2451 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2452 RegsToPass[i].second, InFlag);
2453 InFlag = Chain.getValue(1);
2454 }
2455
2456 if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2457 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2458 // In the 64-bit large code model, we have to make all calls
2459 // through a register, since the call instruction's 32-bit
2460 // pc-relative offset may not be large enough to hold the whole
2461 // address.
2462 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2463 // If the callee is a GlobalAddress node (quite common, every direct call
2464 // is), turn it into a TargetGlobalAddress node so that legalize doesn't
2465 // hack it.
2466
2467 // We should use an extra load for direct calls to dllimported functions in
2468 // non-JIT mode.
2469 const GlobalValue *GV = G->getGlobal();
2470 if (!GV->hasDLLImportLinkage()) {
2471 unsigned char OpFlags = 0;
2472 bool ExtraLoad = false;
2473 unsigned WrapperKind = ISD::DELETED_NODE;
2474
2475 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2476 // external symbols must go through the PLT in PIC mode. If the symbol
2477 // has hidden or protected visibility, or if it is static or local, then
2478 // we don't need to use the PLT - we can directly call it.
2479 if (Subtarget->isTargetELF() &&
2480 getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2481 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2482 OpFlags = X86II::MO_PLT;
2483 } else if (Subtarget->isPICStyleStubAny() &&
2484 (GV->isDeclaration() || GV->isWeakForLinker()) &&
2485 (!Subtarget->getTargetTriple().isMacOSX() ||
2486 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2487 // PC-relative references to external symbols should go through $stub,
2488 // unless we're building with the Leopard linker or later, which
2489 // automatically synthesizes these stubs.
2490 OpFlags = X86II::MO_DARWIN_STUB;
2491 } else if (Subtarget->isPICStyleRIPRel() &&
2492 isa<Function>(GV) &&
2493 cast<Function>(GV)->getFnAttributes().
2494 hasAttribute(Attributes::NonLazyBind)) {
2495 // If the function is marked as non-lazy, generate an indirect call
2496 // which loads from the GOT directly. This avoids runtime overhead
2497 // at the cost of eager binding (and one extra byte of encoding).
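// Illustrative difference in the emitted code (assumed): rather than the
// direct form
//   callq foo
// we get an indirect call through the GOT entry,
//   callq *foo@GOTPCREL(%rip)
// which is what MO_GOTPCREL plus the extra load arrange below.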
2498 OpFlags = X86II::MO_GOTPCREL; 2499 WrapperKind = X86ISD::WrapperRIP; 2500 ExtraLoad = true; 2501 } 2502 2503 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2504 G->getOffset(), OpFlags); 2505 2506 // Add a wrapper if needed. 2507 if (WrapperKind != ISD::DELETED_NODE) 2508 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); 2509 // Add extra indirection if needed. 2510 if (ExtraLoad) 2511 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, 2512 MachinePointerInfo::getGOT(), 2513 false, false, false, 0); 2514 } 2515 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2516 unsigned char OpFlags = 0; 2517 2518 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to 2519 // external symbols should go through the PLT. 2520 if (Subtarget->isTargetELF() && 2521 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2522 OpFlags = X86II::MO_PLT; 2523 } else if (Subtarget->isPICStyleStubAny() && 2524 (!Subtarget->getTargetTriple().isMacOSX() || 2525 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2526 // PC-relative references to external symbols should go through $stub, 2527 // unless we're building with the leopard linker or later, which 2528 // automatically synthesizes these stubs. 2529 OpFlags = X86II::MO_DARWIN_STUB; 2530 } 2531 2532 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2533 OpFlags); 2534 } 2535 2536 // Returns a chain & a flag for retval copy to use. 2537 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2538 SmallVector<SDValue, 8> Ops; 2539 2540 if (!IsSibcall && isTailCall) { 2541 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2542 DAG.getIntPtrConstant(0, true), InFlag); 2543 InFlag = Chain.getValue(1); 2544 } 2545 2546 Ops.push_back(Chain); 2547 Ops.push_back(Callee); 2548 2549 if (isTailCall) 2550 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2551 2552 // Add argument registers to the end of the list so that they are known live 2553 // into the call. 2554 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2555 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2556 RegsToPass[i].second.getValueType())); 2557 2558 // Add a register mask operand representing the call-preserved registers. 2559 const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 2560 const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); 2561 assert(Mask && "Missing call preserved mask for calling convention"); 2562 Ops.push_back(DAG.getRegisterMask(Mask)); 2563 2564 if (InFlag.getNode()) 2565 Ops.push_back(InFlag); 2566 2567 if (isTailCall) { 2568 // We used to do: 2569 //// If this is the first return lowered for this function, add the regs 2570 //// to the liveout set for the function. 2571 // This isn't right, although it's probably harmless on x86; liveouts 2572 // should be computed from returns not tail calls. Consider a void 2573 // function making a tail call to a function returning int. 2574 return DAG.getNode(X86ISD::TC_RETURN, dl, 2575 NodeTys, &Ops[0], Ops.size()); 2576 } 2577 2578 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2579 InFlag = Chain.getValue(1); 2580 2581 // Create the CALLSEQ_END node. 
2582 unsigned NumBytesForCalleeToPush;
2583 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2584 getTargetMachine().Options.GuaranteedTailCallOpt))
2585 NumBytesForCalleeToPush = NumBytes; // Callee pops everything
2586 else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
2587 SR == StackStructReturn)
2588 // If this is a call to a struct-return function, the callee
2589 // pops the hidden struct pointer, so we have to push it back.
2590 // This is common for Darwin/X86, Linux & Mingw32 targets.
2591 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
2592 NumBytesForCalleeToPush = 4;
2593 else
2594 NumBytesForCalleeToPush = 0; // Callee pops nothing.
2595
2596 // Returns a flag for retval copy to use.
2597 if (!IsSibcall) {
2598 Chain = DAG.getCALLSEQ_END(Chain,
2599 DAG.getIntPtrConstant(NumBytes, true),
2600 DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2601 true),
2602 InFlag);
2603 InFlag = Chain.getValue(1);
2604 }
2605
2606 // Handle result values, copying them out of physregs into vregs that we
2607 // return.
2608 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2609 Ins, dl, DAG, InVals);
2610 }
2611
2612
2613 //===----------------------------------------------------------------------===//
2614 // Fast Calling Convention (tail call) implementation
2615 //===----------------------------------------------------------------------===//
2616
2617 // Like the StdCall convention, the callee cleans up the arguments, except
2618 // that ECX is reserved for storing the address of the tail-called function.
2619 // Only 2 registers are free for argument passing (inreg). Tail call
2620 // optimization is performed provided:
2621 // * tailcallopt is enabled
2622 // * caller/callee are fastcc
2623 // On the X86_64 architecture with GOT-style position-independent code, only
2624 // local (within-module) calls are supported at the moment.
2625 // To keep the stack aligned according to the platform ABI, the function
2626 // GetAlignedArgumentStackSize ensures that the argument delta is always a
2627 // multiple of the stack alignment. (Dynamic linkers need this - Darwin's
2628 // dyld, for example.) If a tail-called callee has more arguments than the
2629 // caller, the caller needs to make sure that there is room to move the
2630 // RETADDR to. This is achieved by reserving an area the size of the argument
2631 // delta right after the original RETADDR, but before the saved frame pointer
2632 // or the spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
2633 // stack layout:
2634 // arg1
2635 // arg2
2636 // RETADDR
2637 // [ new RETADDR
2638 // move area ]
2639 // (possible EBP)
2640 // ESI
2641 // EDI
2642 // local1 ..
2643
2644 /// GetAlignedArgumentStackSize - Round the stack size up to e.g. 16n + 12
2645 /// to satisfy a 16-byte alignment requirement (with a 4-byte return slot).
2646 unsigned
2647 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2648 SelectionDAG& DAG) const {
2649 MachineFunction &MF = DAG.getMachineFunction();
2650 const TargetMachine &TM = MF.getTarget();
2651 const TargetFrameLowering &TFI = *TM.getFrameLowering();
2652 unsigned StackAlignment = TFI.getStackAlignment();
2653 uint64_t AlignMask = StackAlignment - 1;
2654 int64_t Offset = StackSize;
2655 uint64_t SlotSize = TD->getPointerSize();
2656 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2657 // The remainder is at most 12, so just add the difference.
2658 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2659 } else {
2660 // Mask out the lower bits, then add the stack alignment plus the 12 bytes.
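// Worked examples (illustrative, StackAlignment = 16, SlotSize = 4):
//   StackSize = 28: 28 & 15 = 12 <= 12, so Offset stays 28 (already 16n+12).
//   StackSize = 30: 30 & 15 = 14 > 12, so Offset = (30 & ~15) + 16 + 12 = 44.
// Either way Offset % 16 == 12, leaving exactly one slot for the RETADDR.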
2661 Offset = ((~AlignMask) & Offset) + StackAlignment + 2662 (StackAlignment-SlotSize); 2663 } 2664 return Offset; 2665} 2666 2667/// MatchingStackOffset - Return true if the given stack call argument is 2668/// already available in the same position (relatively) of the caller's 2669/// incoming argument stack. 2670static 2671bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2672 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2673 const X86InstrInfo *TII) { 2674 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2675 int FI = INT_MAX; 2676 if (Arg.getOpcode() == ISD::CopyFromReg) { 2677 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2678 if (!TargetRegisterInfo::isVirtualRegister(VR)) 2679 return false; 2680 MachineInstr *Def = MRI->getVRegDef(VR); 2681 if (!Def) 2682 return false; 2683 if (!Flags.isByVal()) { 2684 if (!TII->isLoadFromStackSlot(Def, FI)) 2685 return false; 2686 } else { 2687 unsigned Opcode = Def->getOpcode(); 2688 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2689 Def->getOperand(1).isFI()) { 2690 FI = Def->getOperand(1).getIndex(); 2691 Bytes = Flags.getByValSize(); 2692 } else 2693 return false; 2694 } 2695 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2696 if (Flags.isByVal()) 2697 // ByVal argument is passed in as a pointer but it's now being 2698 // dereferenced. e.g. 2699 // define @foo(%struct.X* %A) { 2700 // tail call @bar(%struct.X* byval %A) 2701 // } 2702 return false; 2703 SDValue Ptr = Ld->getBasePtr(); 2704 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2705 if (!FINode) 2706 return false; 2707 FI = FINode->getIndex(); 2708 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 2709 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 2710 FI = FINode->getIndex(); 2711 Bytes = Flags.getByValSize(); 2712 } else 2713 return false; 2714 2715 assert(FI != INT_MAX); 2716 if (!MFI->isFixedObjectIndex(FI)) 2717 return false; 2718 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2719} 2720 2721/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2722/// for tail call optimization. Targets which want to do tail call 2723/// optimization should implement this function. 2724bool 2725X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2726 CallingConv::ID CalleeCC, 2727 bool isVarArg, 2728 bool isCalleeStructRet, 2729 bool isCallerStructRet, 2730 Type *RetTy, 2731 const SmallVectorImpl<ISD::OutputArg> &Outs, 2732 const SmallVectorImpl<SDValue> &OutVals, 2733 const SmallVectorImpl<ISD::InputArg> &Ins, 2734 SelectionDAG& DAG) const { 2735 if (!IsTailCallConvention(CalleeCC) && 2736 CalleeCC != CallingConv::C) 2737 return false; 2738 2739 // If -tailcallopt is specified, make fastcc functions tail-callable. 2740 const MachineFunction &MF = DAG.getMachineFunction(); 2741 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2742 2743 // If the function return type is x86_fp80 and the callee return type is not, 2744 // then the FP_EXTEND of the call result is not a nop. It's not safe to 2745 // perform a tailcall optimization here. 
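// Illustrative case (assumed): a caller declared to return x86_fp80 that
// tail-calls a callee returning double would need an f64-to-f80 FP_EXTEND
// executed after the callee returns, but a tail call leaves no instruction
// slot in the caller for it, so the optimization must be refused.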
2746 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) 2747 return false; 2748 2749 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2750 bool CCMatch = CallerCC == CalleeCC; 2751 2752 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 2753 if (IsTailCallConvention(CalleeCC) && CCMatch) 2754 return true; 2755 return false; 2756 } 2757 2758 // Look for obvious safe cases to perform tail call optimization that do not 2759 // require ABI changes. This is what gcc calls sibcall. 2760 2761 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2762 // emit a special epilogue. 2763 if (RegInfo->needsStackRealignment(MF)) 2764 return false; 2765 2766 // Also avoid sibcall optimization if either caller or callee uses struct 2767 // return semantics. 2768 if (isCalleeStructRet || isCallerStructRet) 2769 return false; 2770 2771 // An stdcall caller is expected to clean up its arguments; the callee 2772 // isn't going to do that. 2773 if (!CCMatch && CallerCC==CallingConv::X86_StdCall) 2774 return false; 2775 2776 // Do not sibcall optimize vararg calls unless all arguments are passed via 2777 // registers. 2778 if (isVarArg && !Outs.empty()) { 2779 2780 // Optimizing for varargs on Win64 is unlikely to be safe without 2781 // additional testing. 2782 if (Subtarget->isTargetWin64()) 2783 return false; 2784 2785 SmallVector<CCValAssign, 16> ArgLocs; 2786 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2787 getTargetMachine(), ArgLocs, *DAG.getContext()); 2788 2789 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2790 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 2791 if (!ArgLocs[i].isRegLoc()) 2792 return false; 2793 } 2794 2795 // If the call result is in ST0 / ST1, it needs to be popped off the x87 2796 // stack. Therefore, if it's not used by the call it is not safe to optimize 2797 // this into a sibcall. 2798 bool Unused = false; 2799 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2800 if (!Ins[i].Used) { 2801 Unused = true; 2802 break; 2803 } 2804 } 2805 if (Unused) { 2806 SmallVector<CCValAssign, 16> RVLocs; 2807 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), 2808 getTargetMachine(), RVLocs, *DAG.getContext()); 2809 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2810 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2811 CCValAssign &VA = RVLocs[i]; 2812 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2813 return false; 2814 } 2815 } 2816 2817 // If the calling conventions do not match, then we'd better make sure the 2818 // results are returned in the same way as what the caller expects. 
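 // (Illustrative case: on 32-bit x86 with SSE2, fastcc may return a float in
 // XMM0 while the C convention returns it in ST0; such a mismatch is caught
 // by the RVLocs comparison below.)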
2819 if (!CCMatch) { 2820 SmallVector<CCValAssign, 16> RVLocs1; 2821 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), 2822 getTargetMachine(), RVLocs1, *DAG.getContext()); 2823 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2824 2825 SmallVector<CCValAssign, 16> RVLocs2; 2826 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), 2827 getTargetMachine(), RVLocs2, *DAG.getContext()); 2828 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2829 2830 if (RVLocs1.size() != RVLocs2.size()) 2831 return false; 2832 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2833 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2834 return false; 2835 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2836 return false; 2837 if (RVLocs1[i].isRegLoc()) { 2838 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2839 return false; 2840 } else { 2841 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2842 return false; 2843 } 2844 } 2845 } 2846 2847 // If the callee takes no arguments then go on to check the results of the 2848 // call. 2849 if (!Outs.empty()) { 2850 // Check if stack adjustment is needed. For now, do not do this if any 2851 // argument is passed on the stack. 2852 SmallVector<CCValAssign, 16> ArgLocs; 2853 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2854 getTargetMachine(), ArgLocs, *DAG.getContext()); 2855 2856 // Allocate shadow area for Win64 2857 if (Subtarget->isTargetWin64()) { 2858 CCInfo.AllocateStack(32, 8); 2859 } 2860 2861 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2862 if (CCInfo.getNextStackOffset()) { 2863 MachineFunction &MF = DAG.getMachineFunction(); 2864 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2865 return false; 2866 2867 // Check if the arguments are already laid out in the right way as 2868 // the caller's fixed stack objects. 2869 MachineFrameInfo *MFI = MF.getFrameInfo(); 2870 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2871 const X86InstrInfo *TII = 2872 ((const X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2873 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2874 CCValAssign &VA = ArgLocs[i]; 2875 SDValue Arg = OutVals[i]; 2876 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2877 if (VA.getLocInfo() == CCValAssign::Indirect) 2878 return false; 2879 if (!VA.isRegLoc()) { 2880 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2881 MFI, MRI, TII)) 2882 return false; 2883 } 2884 } 2885 } 2886 2887 // If the tailcall address may be in a register, then make sure it's 2888 // possible to register allocate for it. In 32-bit, the call address can 2889 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2890 // callee-saved registers are restored. These happen to be the same 2891 // registers used to pass 'inreg' arguments so watch out for those. 
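 // (For example, an indirect tail call whose 'inreg' arguments already
 // occupy EAX, EDX and ECX would leave no register for the call target,
 // so the loop below conservatively returns false.)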
2892 if (!Subtarget->is64Bit() && 2893 !isa<GlobalAddressSDNode>(Callee) && 2894 !isa<ExternalSymbolSDNode>(Callee)) { 2895 unsigned NumInRegs = 0; 2896 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2897 CCValAssign &VA = ArgLocs[i]; 2898 if (!VA.isRegLoc()) 2899 continue; 2900 unsigned Reg = VA.getLocReg(); 2901 switch (Reg) { 2902 default: break; 2903 case X86::EAX: case X86::EDX: case X86::ECX: 2904 if (++NumInRegs == 3) 2905 return false; 2906 break; 2907 } 2908 } 2909 } 2910 } 2911 2912 return true; 2913} 2914 2915FastISel * 2916X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 2917 const TargetLibraryInfo *libInfo) const { 2918 return X86::createFastISel(funcInfo, libInfo); 2919} 2920 2921 2922//===----------------------------------------------------------------------===// 2923// Other Lowering Hooks 2924//===----------------------------------------------------------------------===// 2925 2926static bool MayFoldLoad(SDValue Op) { 2927 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 2928} 2929 2930static bool MayFoldIntoStore(SDValue Op) { 2931 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 2932} 2933 2934static bool isTargetShuffle(unsigned Opcode) { 2935 switch(Opcode) { 2936 default: return false; 2937 case X86ISD::PSHUFD: 2938 case X86ISD::PSHUFHW: 2939 case X86ISD::PSHUFLW: 2940 case X86ISD::SHUFP: 2941 case X86ISD::PALIGN: 2942 case X86ISD::MOVLHPS: 2943 case X86ISD::MOVLHPD: 2944 case X86ISD::MOVHLPS: 2945 case X86ISD::MOVLPS: 2946 case X86ISD::MOVLPD: 2947 case X86ISD::MOVSHDUP: 2948 case X86ISD::MOVSLDUP: 2949 case X86ISD::MOVDDUP: 2950 case X86ISD::MOVSS: 2951 case X86ISD::MOVSD: 2952 case X86ISD::UNPCKL: 2953 case X86ISD::UNPCKH: 2954 case X86ISD::VPERMILP: 2955 case X86ISD::VPERM2X128: 2956 case X86ISD::VPERMI: 2957 return true; 2958 } 2959} 2960 2961static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2962 SDValue V1, SelectionDAG &DAG) { 2963 switch(Opc) { 2964 default: llvm_unreachable("Unknown x86 shuffle node"); 2965 case X86ISD::MOVSHDUP: 2966 case X86ISD::MOVSLDUP: 2967 case X86ISD::MOVDDUP: 2968 return DAG.getNode(Opc, dl, VT, V1); 2969 } 2970} 2971 2972static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2973 SDValue V1, unsigned TargetMask, 2974 SelectionDAG &DAG) { 2975 switch(Opc) { 2976 default: llvm_unreachable("Unknown x86 shuffle node"); 2977 case X86ISD::PSHUFD: 2978 case X86ISD::PSHUFHW: 2979 case X86ISD::PSHUFLW: 2980 case X86ISD::VPERMILP: 2981 case X86ISD::VPERMI: 2982 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2983 } 2984} 2985 2986static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2987 SDValue V1, SDValue V2, unsigned TargetMask, 2988 SelectionDAG &DAG) { 2989 switch(Opc) { 2990 default: llvm_unreachable("Unknown x86 shuffle node"); 2991 case X86ISD::PALIGN: 2992 case X86ISD::SHUFP: 2993 case X86ISD::VPERM2X128: 2994 return DAG.getNode(Opc, dl, VT, V1, V2, 2995 DAG.getConstant(TargetMask, MVT::i8)); 2996 } 2997} 2998 2999static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 3000 SDValue V1, SDValue V2, SelectionDAG &DAG) { 3001 switch(Opc) { 3002 default: llvm_unreachable("Unknown x86 shuffle node"); 3003 case X86ISD::MOVLHPS: 3004 case X86ISD::MOVLHPD: 3005 case X86ISD::MOVHLPS: 3006 case X86ISD::MOVLPS: 3007 case X86ISD::MOVLPD: 3008 case X86ISD::MOVSS: 3009 case X86ISD::MOVSD: 3010 case X86ISD::UNPCKL: 3011 case X86ISD::UNPCKH: 3012 return DAG.getNode(Opc, dl, VT, V1, V2); 3013 } 
3014}
3015
3016SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3017 MachineFunction &MF = DAG.getMachineFunction();
3018 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3019 int ReturnAddrIndex = FuncInfo->getRAIndex();
3020
3021 if (ReturnAddrIndex == 0) {
3022 // Set up a frame object for the return address.
3023 uint64_t SlotSize = TD->getPointerSize();
3024 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
3025 false);
3026 FuncInfo->setRAIndex(ReturnAddrIndex);
3027 }
3028
3029 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3030}
3031
3032
3033bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3034 bool hasSymbolicDisplacement) {
3035 // Offset should fit into a 32-bit immediate field.
3036 if (!isInt<32>(Offset))
3037 return false;
3038
3039 // If we don't have a symbolic displacement, we don't have any extra
3040 // restrictions.
3041 if (!hasSymbolicDisplacement)
3042 return true;
3043
3044 // FIXME: Some tweaks might be needed for medium code model.
3045 if (M != CodeModel::Small && M != CodeModel::Kernel)
3046 return false;
3047
3048 // For the small code model, we assume that the last object is 16MB before
3049 // the end of the 31-bit boundary. We may also accept fairly large negative
3050 // constants, knowing that all objects live in the positive half of the address space.
3051 if (M == CodeModel::Small && Offset < 16*1024*1024)
3052 return true;
3053
3054 // For the kernel code model, we know that all objects reside in the negative
3055 // half of the 32-bit address space. We may not accept negative offsets, since
3056 // they may be just out of range, but we may accept fairly large positive ones.
3057 if (M == CodeModel::Kernel && Offset > 0)
3058 return true;
3059
3060 return false;
3061}
3062
3063/// isCalleePop - Determines whether the callee is required to pop its
3064/// own arguments. Callee pop is necessary to support tail calls.
3065bool X86::isCalleePop(CallingConv::ID CallingConv,
3066 bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3067 if (IsVarArg)
3068 return false;
3069
3070 switch (CallingConv) {
3071 default:
3072 return false;
3073 case CallingConv::X86_StdCall:
3074 return !is64Bit;
3075 case CallingConv::X86_FastCall:
3076 return !is64Bit;
3077 case CallingConv::X86_ThisCall:
3078 return !is64Bit;
3079 case CallingConv::Fast:
3080 return TailCallOpt;
3081 case CallingConv::GHC:
3082 return TailCallOpt;
3083 }
3084}
3085
3086/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
3087/// X86-specific condition code, returning the condition code and the LHS/RHS
3088/// of the comparison to make.
3089static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3090 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3091 if (!isFP) {
3092 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3093 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3094 // X > -1 -> X == 0, jump !sign.
3095 RHS = DAG.getConstant(0, RHS.getValueType());
3096 return X86::COND_NS;
3097 }
3098 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3099 // X < 0 -> X == 0, jump on sign.
3100 return X86::COND_S;
3101 }
3102 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3103 // X < 1 -> X <= 0
3104 RHS = DAG.getConstant(0, RHS.getValueType());
3105 return X86::COND_LE;
3106 }
3107 }
3108
3109 switch (SetCCOpcode) {
3110 default: llvm_unreachable("Invalid integer condition!");
3111 case ISD::SETEQ: return X86::COND_E;
3112 case ISD::SETGT: return X86::COND_G;
3113 case ISD::SETGE: return X86::COND_GE;
3114 case ISD::SETLT: return X86::COND_L;
3115 case ISD::SETLE: return X86::COND_LE;
3116 case ISD::SETNE: return X86::COND_NE;
3117 case ISD::SETULT: return X86::COND_B;
3118 case ISD::SETUGT: return X86::COND_A;
3119 case ISD::SETULE: return X86::COND_BE;
3120 case ISD::SETUGE: return X86::COND_AE;
3121 }
3122 }
3123
3124 // First determine if it is required or is profitable to flip the operands.
3125
3126 // If LHS is a foldable load, but RHS is not, flip the condition.
3127 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3128 !ISD::isNON_EXTLoad(RHS.getNode())) {
3129 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3130 std::swap(LHS, RHS);
3131 }
3132
3133 switch (SetCCOpcode) {
3134 default: break;
3135 case ISD::SETOLT:
3136 case ISD::SETOLE:
3137 case ISD::SETUGT:
3138 case ISD::SETUGE:
3139 std::swap(LHS, RHS);
3140 break;
3141 }
3142
3143 // On a floating point condition, the flags are set as follows:
3144 // ZF PF CF op
3145 // 0 | 0 | 0 | X > Y
3146 // 0 | 0 | 1 | X < Y
3147 // 1 | 0 | 0 | X == Y
3148 // 1 | 1 | 1 | unordered
3149 switch (SetCCOpcode) {
3150 default: llvm_unreachable("Condcode should be pre-legalized away");
3151 case ISD::SETUEQ:
3152 case ISD::SETEQ: return X86::COND_E;
3153 case ISD::SETOLT: // flipped
3154 case ISD::SETOGT:
3155 case ISD::SETGT: return X86::COND_A;
3156 case ISD::SETOLE: // flipped
3157 case ISD::SETOGE:
3158 case ISD::SETGE: return X86::COND_AE;
3159 case ISD::SETUGT: // flipped
3160 case ISD::SETULT:
3161 case ISD::SETLT: return X86::COND_B;
3162 case ISD::SETUGE: // flipped
3163 case ISD::SETULE:
3164 case ISD::SETLE: return X86::COND_BE;
3165 case ISD::SETONE:
3166 case ISD::SETNE: return X86::COND_NE;
3167 case ISD::SETUO: return X86::COND_P;
3168 case ISD::SETO: return X86::COND_NP;
3169 case ISD::SETOEQ:
3170 case ISD::SETUNE: return X86::COND_INVALID;
3171 }
3172}
3173
3174/// hasFPCMov - Is there a floating-point cmov for the specific X86 condition
3175/// code? The current x86 ISA includes the following FP cmov instructions:
3176/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3177static bool hasFPCMov(unsigned X86CC) {
3178 switch (X86CC) {
3179 default:
3180 return false;
3181 case X86::COND_B:
3182 case X86::COND_BE:
3183 case X86::COND_E:
3184 case X86::COND_P:
3185 case X86::COND_A:
3186 case X86::COND_AE:
3187 case X86::COND_NE:
3188 case X86::COND_NP:
3189 return true;
3190 }
3191}
3192
3193/// isFPImmLegal - Returns true if the target can instruction select the
3194/// specified FP immediate natively. If false, the legalizer will
3195/// materialize the FP immediate as a load from a constant pool.
3196bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3197 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3198 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3199 return true;
3200 }
3201 return false;
3202}
3203
3204/// isUndefOrInRange - Return true if Val is undef or if its value falls within
3205/// the specified range [Low, Hi).
3206static bool isUndefOrInRange(int Val, int Low, int Hi) {
3207 return (Val < 0) || (Val >= Low && Val < Hi);
3208}
3209
3210/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
3211/// specified value.
3212static bool isUndefOrEqual(int Val, int CmpVal) {
3213 if (Val < 0 || Val == CmpVal)
3214 return true;
3215 return false;
3216}
3217
3218/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3219/// at position Pos and ending at Pos+Size, falls within the specified
3220/// sequential range [Low, Low+Size) or is undef.
3221static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3222 unsigned Pos, unsigned Size, int Low) {
3223 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3224 if (!isUndefOrEqual(Mask[i], Low))
3225 return false;
3226 return true;
3227}
3228
3229/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3230/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
3231/// the second operand.
3232static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
3233 if (VT == MVT::v4f32 || VT == MVT::v4i32 )
3234 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
3235 if (VT == MVT::v2f64 || VT == MVT::v2i64)
3236 return (Mask[0] < 2 && Mask[1] < 2);
3237 return false;
3238}
3239
3240/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3241/// is suitable for input to PSHUFHW.
3242static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
3243 if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16))
3244 return false;
3245
3246 // Lower quadword copied in order or undef.
3247 if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3248 return false;
3249
3250 // Upper quadword shuffled.
3251 for (unsigned i = 4; i != 8; ++i)
3252 if (!isUndefOrInRange(Mask[i], 4, 8))
3253 return false;
3254
3255 if (VT == MVT::v16i16) {
3256 // Lower quadword copied in order or undef.
3257 if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3258 return false;
3259
3260 // Upper quadword shuffled.
3261 for (unsigned i = 12; i != 16; ++i)
3262 if (!isUndefOrInRange(Mask[i], 12, 16))
3263 return false;
3264 }
3265
3266 return true;
3267}
3268
3269/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3270/// is suitable for input to PSHUFLW.
3271static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
3272 if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16))
3273 return false;
3274
3275 // Upper quadword copied in order.
3276 if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
3277 return false;
3278
3279 // Lower quadword shuffled.
3280 for (unsigned i = 0; i != 4; ++i)
3281 if (!isUndefOrInRange(Mask[i], 0, 4))
3282 return false;
3283
3284 if (VT == MVT::v16i16) {
3285 // Upper quadword copied in order.
3286 if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
3287 return false;
3288
3289 // Lower quadword shuffled.
3290 for (unsigned i = 8; i != 12; ++i)
3291 if (!isUndefOrInRange(Mask[i], 8, 12))
3292 return false;
3293 }
3294
3295 return true;
3296}
3297
3298/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
3299/// is suitable for input to PALIGNR.
3300static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
3301 const X86Subtarget *Subtarget) {
3302 if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) ||
3303 (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()))
3304 return false;
3305
3306 unsigned NumElts = VT.getVectorNumElements();
3307 unsigned NumLanes = VT.getSizeInBits()/128;
3308 unsigned NumLaneElts = NumElts/NumLanes;
3309
3310 // Do not handle 64-bit element shuffles with palignr.
3311 if (NumLaneElts == 2)
3312 return false;
3313
3314 for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
3315 unsigned i;
3316 for (i = 0; i != NumLaneElts; ++i) {
3317 if (Mask[i+l] >= 0)
3318 break;
3319 }
3320
3321 // Lane is all undef, go to next lane.
3322 if (i == NumLaneElts)
3323 continue;
3324
3325 int Start = Mask[i+l];
3326
3327 // Make sure it's in this lane in one of the sources.
3328 if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
3329 !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
3330 return false;
3331
3332 // If not lane 0, then we must match lane 0.
3333 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
3334 return false;
3335
3336 // Correct the second source to be contiguous with the first source.
3337 if (Start >= (int)NumElts)
3338 Start -= NumElts - NumLaneElts;
3339
3340 // Make sure we're shifting in the right direction.
3341 if (Start <= (int)(i+l))
3342 return false;
3343
3344 Start -= i;
3345
3346 // Check the rest of the elements to see if they are consecutive.
3347 for (++i; i != NumLaneElts; ++i) {
3348 int Idx = Mask[i+l];
3349
3350 // Make sure it's in this lane.
3351 if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
3352 !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
3353 return false;
3354
3355 // If not lane 0, then we must match lane 0.
3356 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
3357 return false;
3358
3359 if (Idx >= (int)NumElts)
3360 Idx -= NumElts - NumLaneElts;
3361
3362 if (!isUndefOrEqual(Idx, Start+i))
3363 return false;
3364
3365 }
3366 }
3367
3368 return true;
3369}
3370
3371/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
3372/// the two vector operands have swapped position.
3373static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
3374 unsigned NumElems) {
3375 for (unsigned i = 0; i != NumElems; ++i) {
3376 int idx = Mask[i];
3377 if (idx < 0)
3378 continue;
3379 else if (idx < (int)NumElems)
3380 Mask[i] = idx + NumElems;
3381 else
3382 Mask[i] = idx - NumElems;
3383 }
3384}
3385
3386/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
3387/// specifies a shuffle of elements that is suitable for input to 128/256-bit
3388/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
3389/// the reverse of what x86 shuffles want.
3390static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX,
3391 bool Commuted = false) {
3392 if (!HasAVX && VT.getSizeInBits() == 256)
3393 return false;
3394
3395 unsigned NumElems = VT.getVectorNumElements();
3396 unsigned NumLanes = VT.getSizeInBits()/128;
3397 unsigned NumLaneElems = NumElems/NumLanes;
3398
3399 if (NumLaneElems != 2 && NumLaneElems != 4)
3400 return false;
3401
3402 // VSHUFPSY divides the resulting vector into 4 chunks.
3403 // The sources are also split into 4 chunks, and each destination
3404 // chunk must come from a different source chunk.
3405 //
3406 // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
3407 // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
3408 //
3409 // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
3410 // Y3..Y0, Y3..Y0, X3..X0, X3..X0
3411 //
3412 // VSHUFPDY divides the resulting vector into 4 chunks.
3413 // The sources are also split into 4 chunks, and each destination
3414 // chunk must come from a different source chunk.
3415 //
3416 // SRC1 => X3 X2 X1 X0
3417 // SRC2 => Y3 Y2 Y1 Y0
3418 //
3419 // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
3420 //
3421 unsigned HalfLaneElems = NumLaneElems/2;
3422 for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
3423 for (unsigned i = 0; i != NumLaneElems; ++i) {
3424 int Idx = Mask[i+l];
3425 unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
3426 if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
3427 return false;
3428 // For VSHUFPSY, the mask of the second half must be the same as the
3429 // first but with the appropriate offsets. This works in the same way as
3430 // VPERMILPS works with masks.
3431 if (NumElems != 8 || l == 0 || Mask[i] < 0)
3432 continue;
3433 if (!isUndefOrEqual(Idx, Mask[i]+l))
3434 return false;
3435 }
3436 }
3437
3438 return true;
3439}
3440
3441/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
3442/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
3443static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
3444 if (!VT.is128BitVector())
3445 return false;
3446
3447 unsigned NumElems = VT.getVectorNumElements();
3448
3449 if (NumElems != 4)
3450 return false;
3451
3452 // Expect element 0 == 6, element 1 == 7, element 2 == 2, element 3 == 3.
3453 return isUndefOrEqual(Mask[0], 6) &&
3454 isUndefOrEqual(Mask[1], 7) &&
3455 isUndefOrEqual(Mask[2], 2) &&
3456 isUndefOrEqual(Mask[3], 3);
3457}
3458
3459/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
3460/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
3461/// <2, 3, 2, 3>
3462static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
3463 if (!VT.is128BitVector())
3464 return false;
3465
3466 unsigned NumElems = VT.getVectorNumElements();
3467
3468 if (NumElems != 4)
3469 return false;
3470
3471 return isUndefOrEqual(Mask[0], 2) &&
3472 isUndefOrEqual(Mask[1], 3) &&
3473 isUndefOrEqual(Mask[2], 2) &&
3474 isUndefOrEqual(Mask[3], 3);
3475}
3476
3477/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
3478/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
3479static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
3480 if (!VT.is128BitVector())
3481 return false;
3482
3483 unsigned NumElems = VT.getVectorNumElements();
3484
3485 if (NumElems != 2 && NumElems != 4)
3486 return false;
3487
3488 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3489 if (!isUndefOrEqual(Mask[i], i + NumElems))
3490 return false;
3491
3492 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
3493 if (!isUndefOrEqual(Mask[i], i))
3494 return false;
3495
3496 return true;
3497}
3498
3499/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
3500/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
3501static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
3502 if (!VT.is128BitVector())
3503 return false;
3504
3505 unsigned NumElems = VT.getVectorNumElements();
3506
3507 if (NumElems != 2 && NumElems != 4)
3508 return false;
3509
3510 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3511 if (!isUndefOrEqual(Mask[i], i))
3512 return false;
3513
3514 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3515 if (!isUndefOrEqual(Mask[i + e], i + NumElems))
3516 return false;
3517
3518 return true;
3519}
3520
3521//
3522// Some special combinations that can be optimized.
3523//
3524static
3525SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
3526 SelectionDAG &DAG) {
3527 EVT VT = SVOp->getValueType(0);
3528 DebugLoc dl = SVOp->getDebugLoc();
3529
3530 if (VT != MVT::v8i32 && VT != MVT::v8f32)
3531 return SDValue();
3532
3533 ArrayRef<int> Mask = SVOp->getMask();
3534
3535 // These are the special masks that may be optimized.
3536 static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
3537 static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15};
3538 bool MatchEvenMask = true;
3539 bool MatchOddMask = true;
3540 for (int i=0; i<8; ++i) {
3541 if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
3542 MatchEvenMask = false;
3543 if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
3544 MatchOddMask = false;
3545 }
3546
3547 if (!MatchEvenMask && !MatchOddMask)
3548 return SDValue();
3549
3550 SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
3551
3552 SDValue Op0 = SVOp->getOperand(0);
3553 SDValue Op1 = SVOp->getOperand(1);
3554
3555 if (MatchEvenMask) {
3556 // Shift the second operand right by 32 bits.
3557 static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
3558 Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
3559 } else {
3560 // Shift the first operand left by 32 bits.
3561 static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
3562 Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
3563 }
3564 static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
3565 return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
3566}
3567
3568/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
3569/// specifies a shuffle of elements that is suitable for input to UNPCKL.
3570static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
3571 bool HasAVX2, bool V2IsSplat = false) {
3572 unsigned NumElts = VT.getVectorNumElements();
3573
3574 assert((VT.is128BitVector() || VT.is256BitVector()) &&
3575 "Unsupported vector type for unpckl");
3576
3577 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3578 (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
3579 return false;
3580
3581 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3582 // independently on 128-bit lanes.
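 // For example (illustrative), a v8i32 UNPCKL on AVX2 must match the mask
 // <0, 8, 1, 9, 4, 12, 5, 13>: each 128-bit lane interleaves the low half
 // of the corresponding lanes of the two sources.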
3583 unsigned NumLanes = VT.getSizeInBits()/128;
3584 unsigned NumLaneElts = NumElts/NumLanes;
3585
3586 for (unsigned l = 0; l != NumLanes; ++l) {
3587 for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
3588 i != (l+1)*NumLaneElts;
3589 i += 2, ++j) {
3590 int BitI = Mask[i];
3591 int BitI1 = Mask[i+1];
3592 if (!isUndefOrEqual(BitI, j))
3593 return false;
3594 if (V2IsSplat) {
3595 if (!isUndefOrEqual(BitI1, NumElts))
3596 return false;
3597 } else {
3598 if (!isUndefOrEqual(BitI1, j + NumElts))
3599 return false;
3600 }
3601 }
3602 }
3603
3604 return true;
3605}
3606
3607/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
3608/// specifies a shuffle of elements that is suitable for input to UNPCKH.
3609static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
3610 bool HasAVX2, bool V2IsSplat = false) {
3611 unsigned NumElts = VT.getVectorNumElements();
3612
3613 assert((VT.is128BitVector() || VT.is256BitVector()) &&
3614 "Unsupported vector type for unpckh");
3615
3616 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3617 (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
3618 return false;
3619
3620 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3621 // independently on 128-bit lanes.
3622 unsigned NumLanes = VT.getSizeInBits()/128;
3623 unsigned NumLaneElts = NumElts/NumLanes;
3624
3625 for (unsigned l = 0; l != NumLanes; ++l) {
3626 for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
3627 i != (l+1)*NumLaneElts; i += 2, ++j) {
3628 int BitI = Mask[i];
3629 int BitI1 = Mask[i+1];
3630 if (!isUndefOrEqual(BitI, j))
3631 return false;
3632 if (V2IsSplat) {
3633 if (isUndefOrEqual(BitI1, NumElts))
3634 return false;
3635 } else {
3636 if (!isUndefOrEqual(BitI1, j+NumElts))
3637 return false;
3638 }
3639 }
3640 }
3641 return true;
3642}
3643
3644/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
3645/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
3646/// <0, 0, 1, 1>
3647static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT,
3648 bool HasAVX2) {
3649 unsigned NumElts = VT.getVectorNumElements();
3650
3651 assert((VT.is128BitVector() || VT.is256BitVector()) &&
3652 "Unsupported vector type for unpckl");
3653
3654 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3655 (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
3656 return false;
3657
3658 // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern.
3659 // FIXME: Need a better way to get rid of this, there's no latency difference
3660 // between UNPCKLPD and MOVDDUP; the latter should always be checked first
3661 // and the former later. We should also remove the "_undef" special mask.
3662 if (NumElts == 4 && VT.getSizeInBits() == 256)
3663 return false;
3664
3665 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3666 // independently on 128-bit lanes.
3667 unsigned NumLanes = VT.getSizeInBits()/128;
3668 unsigned NumLaneElts = NumElts/NumLanes;
3669
3670 for (unsigned l = 0; l != NumLanes; ++l) {
3671 for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
3672 i != (l+1)*NumLaneElts;
3673 i += 2, ++j) {
3674 int BitI = Mask[i];
3675 int BitI1 = Mask[i+1];
3676
3677 if (!isUndefOrEqual(BitI, j))
3678 return false;
3679 if (!isUndefOrEqual(BitI1, j))
3680 return false;
3681 }
3682 }
3683
3684 return true;
3685}
3686
3687/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
3688/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
3689/// <2, 2, 3, 3>
3690static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
3691 unsigned NumElts = VT.getVectorNumElements();
3692
3693 assert((VT.is128BitVector() || VT.is256BitVector()) &&
3694 "Unsupported vector type for unpckh");
3695
3696 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3697 (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
3698 return false;
3699
3700 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3701 // independently on 128-bit lanes.
3702 unsigned NumLanes = VT.getSizeInBits()/128;
3703 unsigned NumLaneElts = NumElts/NumLanes;
3704
3705 for (unsigned l = 0; l != NumLanes; ++l) {
3706 for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
3707 i != (l+1)*NumLaneElts; i += 2, ++j) {
3708 int BitI = Mask[i];
3709 int BitI1 = Mask[i+1];
3710 if (!isUndefOrEqual(BitI, j))
3711 return false;
3712 if (!isUndefOrEqual(BitI1, j))
3713 return false;
3714 }
3715 }
3716 return true;
3717}
3718
3719/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
3720/// specifies a shuffle of elements that is suitable for input to MOVSS,
3721/// MOVSD, and MOVD, i.e. setting the lowest element.
3722static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
3723 if (VT.getVectorElementType().getSizeInBits() < 32)
3724 return false;
3725 if (!VT.is128BitVector())
3726 return false;
3727
3728 unsigned NumElts = VT.getVectorNumElements();
3729
3730 if (!isUndefOrEqual(Mask[0], NumElts))
3731 return false;
3732
3733 for (unsigned i = 1; i != NumElts; ++i)
3734 if (!isUndefOrEqual(Mask[i], i))
3735 return false;
3736
3737 return true;
3738}
3739
3740/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
3741/// as permutations between 128-bit chunks or halves. As an example: this
3742/// shuffle below:
3743/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
3744/// The first half comes from the second half of V1 and the second half from
3745/// the second half of V2.
3746static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
3747 if (!HasAVX || !VT.is256BitVector())
3748 return false;
3749
3750 // The shuffle result is divided into half A and half B. In total the two
3751 // sources have 4 halves, namely: C, D, E, F. The final values of A and
3752 // B must come from C, D, E or F.
3753 unsigned HalfSize = VT.getVectorNumElements()/2;
3754 bool MatchA = false, MatchB = false;
3755
3756 // Check if A comes from one of C, D, E, F.
3757 for (unsigned Half = 0; Half != 4; ++Half) {
3758 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
3759 MatchA = true;
3760 break;
3761 }
3762 }
3763
3764 // Check if B comes from one of C, D, E, F.
3765 for (unsigned Half = 0; Half != 4; ++Half) {
3766 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
3767 MatchB = true;
3768 break;
3769 }
3770 }
3771
3772 return MatchA && MatchB;
3773}
3774
3775/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
3776/// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
3777static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
3778 EVT VT = SVOp->getValueType(0);
3779
3780 unsigned HalfSize = VT.getVectorNumElements()/2;
3781
3782 unsigned FstHalf = 0, SndHalf = 0;
3783 for (unsigned i = 0; i < HalfSize; ++i) {
3784 if (SVOp->getMaskElt(i) > 0) {
3785 FstHalf = SVOp->getMaskElt(i)/HalfSize;
3786 break;
3787 }
3788 }
3789 for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
3790 if (SVOp->getMaskElt(i) > 0) {
3791 SndHalf = SVOp->getMaskElt(i)/HalfSize;
3792 break;
3793 }
3794 }
3795
3796 return (FstHalf | (SndHalf << 4));
3797}
3798
3799/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
3800/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
3801/// Note that VPERMIL mask matching differs depending on whether the underlying
3802/// type is 32 or 64 bits. For VPERMILPS the high half of the mask should point
3803/// to the same elements as the low half, but in the higher half of the source.
3804/// In VPERMILPD the two lanes could be shuffled independently of each other
3805/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
3806static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
3807 if (!HasAVX)
3808 return false;
3809
3810 unsigned NumElts = VT.getVectorNumElements();
3811 // Only match 256-bit with 32/64-bit types
3812 if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8))
3813 return false;
3814
3815 unsigned NumLanes = VT.getSizeInBits()/128;
3816 unsigned LaneSize = NumElts/NumLanes;
3817 for (unsigned l = 0; l != NumElts; l += LaneSize) {
3818 for (unsigned i = 0; i != LaneSize; ++i) {
3819 if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
3820 return false;
3821 if (NumElts != 8 || l == 0)
3822 continue;
3823 // VPERMILPS handling
3824 if (Mask[i] < 0)
3825 continue;
3826 if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
3827 return false;
3828 }
3829 }
3830
3831 return true;
3832}
3833
3834/// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of
3835/// what x86 movss wants. X86 movss requires the lowest element to be the
3836/// lowest element of vector 2 and the other elements to come from vector 1 in order.
3837static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
3838 bool V2IsSplat = false, bool V2IsUndef = false) {
3839 if (!VT.is128BitVector())
3840 return false;
3841
3842 unsigned NumOps = VT.getVectorNumElements();
3843 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
3844 return false;
3845
3846 if (!isUndefOrEqual(Mask[0], 0))
3847 return false;
3848
3849 for (unsigned i = 1; i != NumOps; ++i)
3850 if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
3851 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
3852 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
3853 return false;
3854
3855 return true;
3856}
3857
3858/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3859/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
3860/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> 3861static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT, 3862 const X86Subtarget *Subtarget) { 3863 if (!Subtarget->hasSSE3()) 3864 return false; 3865 3866 unsigned NumElems = VT.getVectorNumElements(); 3867 3868 if ((VT.getSizeInBits() == 128 && NumElems != 4) || 3869 (VT.getSizeInBits() == 256 && NumElems != 8)) 3870 return false; 3871 3872 // "i+1" is the value the indexed mask element must have 3873 for (unsigned i = 0; i != NumElems; i += 2) 3874 if (!isUndefOrEqual(Mask[i], i+1) || 3875 !isUndefOrEqual(Mask[i+1], i+1)) 3876 return false; 3877 3878 return true; 3879} 3880 3881/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3882/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3883/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> 3884static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT, 3885 const X86Subtarget *Subtarget) { 3886 if (!Subtarget->hasSSE3()) 3887 return false; 3888 3889 unsigned NumElems = VT.getVectorNumElements(); 3890 3891 if ((VT.getSizeInBits() == 128 && NumElems != 4) || 3892 (VT.getSizeInBits() == 256 && NumElems != 8)) 3893 return false; 3894 3895 // "i" is the value the indexed mask element must have 3896 for (unsigned i = 0; i != NumElems; i += 2) 3897 if (!isUndefOrEqual(Mask[i], i) || 3898 !isUndefOrEqual(Mask[i+1], i)) 3899 return false; 3900 3901 return true; 3902} 3903 3904/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand 3905/// specifies a shuffle of elements that is suitable for input to 256-bit 3906/// version of MOVDDUP. 3907static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { 3908 if (!HasAVX || !VT.is256BitVector()) 3909 return false; 3910 3911 unsigned NumElts = VT.getVectorNumElements(); 3912 if (NumElts != 4) 3913 return false; 3914 3915 for (unsigned i = 0; i != NumElts/2; ++i) 3916 if (!isUndefOrEqual(Mask[i], 0)) 3917 return false; 3918 for (unsigned i = NumElts/2; i != NumElts; ++i) 3919 if (!isUndefOrEqual(Mask[i], NumElts/2)) 3920 return false; 3921 return true; 3922} 3923 3924/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3925/// specifies a shuffle of elements that is suitable for input to 128-bit 3926/// version of MOVDDUP. 3927static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) { 3928 if (!VT.is128BitVector()) 3929 return false; 3930 3931 unsigned e = VT.getVectorNumElements() / 2; 3932 for (unsigned i = 0; i != e; ++i) 3933 if (!isUndefOrEqual(Mask[i], i)) 3934 return false; 3935 for (unsigned i = 0; i != e; ++i) 3936 if (!isUndefOrEqual(Mask[e+i], i)) 3937 return false; 3938 return true; 3939} 3940 3941/// isVEXTRACTF128Index - Return true if the specified 3942/// EXTRACT_SUBVECTOR operand specifies a vector extract that is 3943/// suitable for input to VEXTRACTF128. 3944bool X86::isVEXTRACTF128Index(SDNode *N) { 3945 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3946 return false; 3947 3948 // The index should be aligned on a 128-bit boundary. 
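 // (Illustrative: when the extracted subvector is v4i32, ElSize below is 32,
 // so only an Index that is a multiple of 4, e.g. 0 or 4, satisfies
 // (Index * ElSize) % 128 == 0.)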
3949 uint64_t Index =
3950 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
3951
3952 unsigned VL = N->getValueType(0).getVectorNumElements();
3953 unsigned VBits = N->getValueType(0).getSizeInBits();
3954 unsigned ElSize = VBits / VL;
3955 bool Result = (Index * ElSize) % 128 == 0;
3956
3957 return Result;
3958}
3959
3960/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR
3961/// operand specifies a subvector insert that is suitable for input to
3962/// VINSERTF128.
3963bool X86::isVINSERTF128Index(SDNode *N) {
3964 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
3965 return false;
3966
3967 // The index should be aligned on a 128-bit boundary.
3968 uint64_t Index =
3969 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
3970
3971 unsigned VL = N->getValueType(0).getVectorNumElements();
3972 unsigned VBits = N->getValueType(0).getSizeInBits();
3973 unsigned ElSize = VBits / VL;
3974 bool Result = (Index * ElSize) % 128 == 0;
3975
3976 return Result;
3977}
3978
3979/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
3980/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
3981/// Handles 128-bit and 256-bit.
3982static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
3983 EVT VT = N->getValueType(0);
3984
3985 assert((VT.is128BitVector() || VT.is256BitVector()) &&
3986 "Unsupported vector type for PSHUF/SHUFP");
3987
3988 // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
3989 // independently on 128-bit lanes.
3990 unsigned NumElts = VT.getVectorNumElements();
3991 unsigned NumLanes = VT.getSizeInBits()/128;
3992 unsigned NumLaneElts = NumElts/NumLanes;
3993
3994 assert((NumLaneElts == 2 || NumLaneElts == 4) &&
3995 "Only supports 2 or 4 elements per lane");
3996
3997 unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
3998 unsigned Mask = 0;
3999 for (unsigned i = 0; i != NumElts; ++i) {
4000 int Elt = N->getMaskElt(i);
4001 if (Elt < 0) continue;
4002 Elt &= NumLaneElts - 1;
4003 unsigned ShAmt = (i << Shift) % 8;
4004 Mask |= Elt << ShAmt;
4005 }
4006
4007 return Mask;
4008}
4009
4010/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4011/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
4012static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4013 EVT VT = N->getValueType(0);
4014
4015 assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4016 "Unsupported vector type for PSHUFHW");
4017
4018 unsigned NumElts = VT.getVectorNumElements();
4019
4020 unsigned Mask = 0;
4021 for (unsigned l = 0; l != NumElts; l += 8) {
4022 // 8 elements per lane, but we only care about the last 4.
4023 for (unsigned i = 0; i < 4; ++i) {
4024 int Elt = N->getMaskElt(l+i+4);
4025 if (Elt < 0) continue;
4026 Elt &= 0x3; // only 2 bits.
4027 Mask |= Elt << (i * 2);
4028 }
4029 }
4030
4031 return Mask;
4032}
4033
4034/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4035/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
4036static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4037 EVT VT = N->getValueType(0);
4038
4039 assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4040 "Unsupported vector type for PSHUFLW");
4041
4042 unsigned NumElts = VT.getVectorNumElements();
4043
4044 unsigned Mask = 0;
4045 for (unsigned l = 0; l != NumElts; l += 8) {
4046 // 8 elements per lane, but we only care about the first 4.
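 // (Illustrative: for a v8i16 mask <2, 1, 0, 3, 4, 5, 6, 7> this encodes
 // 2 | (1 << 2) | (0 << 4) | (3 << 6) == 0xC6.)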
4047 for (unsigned i = 0; i < 4; ++i) { 4048 int Elt = N->getMaskElt(l+i); 4049 if (Elt < 0) continue; 4050 Elt &= 0x3; // only 2-bits 4051 Mask |= Elt << (i * 2); 4052 } 4053 } 4054 4055 return Mask; 4056} 4057 4058/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 4059/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 4060static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { 4061 EVT VT = SVOp->getValueType(0); 4062 unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3; 4063 4064 unsigned NumElts = VT.getVectorNumElements(); 4065 unsigned NumLanes = VT.getSizeInBits()/128; 4066 unsigned NumLaneElts = NumElts/NumLanes; 4067 4068 int Val = 0; 4069 unsigned i; 4070 for (i = 0; i != NumElts; ++i) { 4071 Val = SVOp->getMaskElt(i); 4072 if (Val >= 0) 4073 break; 4074 } 4075 if (Val >= (int)NumElts) 4076 Val -= NumElts - NumLaneElts; 4077 4078 assert(Val - i > 0 && "PALIGNR imm should be positive"); 4079 return (Val - i) * EltSize; 4080} 4081 4082/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate 4083/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 4084/// instructions. 4085unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { 4086 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 4087 llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); 4088 4089 uint64_t Index = 4090 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 4091 4092 EVT VecVT = N->getOperand(0).getValueType(); 4093 EVT ElVT = VecVT.getVectorElementType(); 4094 4095 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 4096 return Index / NumElemsPerChunk; 4097} 4098 4099/// getInsertVINSERTF128Immediate - Return the appropriate immediate 4100/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 4101/// instructions. 4102unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { 4103 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 4104 llvm_unreachable("Illegal insert subvector for VINSERTF128"); 4105 4106 uint64_t Index = 4107 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 4108 4109 EVT VecVT = N->getValueType(0); 4110 EVT ElVT = VecVT.getVectorElementType(); 4111 4112 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 4113 return Index / NumElemsPerChunk; 4114} 4115 4116/// getShuffleCLImmediate - Return the appropriate immediate to shuffle 4117/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions. 4118/// Handles 256-bit. 4119static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) { 4120 EVT VT = N->getValueType(0); 4121 4122 unsigned NumElts = VT.getVectorNumElements(); 4123 4124 assert((VT.is256BitVector() && NumElts == 4) && 4125 "Unsupported vector type for VPERMQ/VPERMPD"); 4126 4127 unsigned Mask = 0; 4128 for (unsigned i = 0; i != NumElts; ++i) { 4129 int Elt = N->getMaskElt(i); 4130 if (Elt < 0) 4131 continue; 4132 Mask |= Elt << (i*2); 4133 } 4134 4135 return Mask; 4136} 4137/// isZeroNode - Returns true if Elt is a constant zero or a floating point 4138/// constant +0.0. 4139bool X86::isZeroNode(SDValue Elt) { 4140 return ((isa<ConstantSDNode>(Elt) && 4141 cast<ConstantSDNode>(Elt)->isNullValue()) || 4142 (isa<ConstantFPSDNode>(Elt) && 4143 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 4144} 4145 4146/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 4147/// their permute mask. 
4148static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
4149 SelectionDAG &DAG) {
4150 EVT VT = SVOp->getValueType(0);
4151 unsigned NumElems = VT.getVectorNumElements();
4152 SmallVector<int, 8> MaskVec;
4153
4154 for (unsigned i = 0; i != NumElems; ++i) {
4155 int Idx = SVOp->getMaskElt(i);
4156 if (Idx >= 0) {
4157 if (Idx < (int)NumElems)
4158 Idx += NumElems;
4159 else
4160 Idx -= NumElems;
4161 }
4162 MaskVec.push_back(Idx);
4163 }
4164 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
4165 SVOp->getOperand(0), &MaskVec[0]);
4166}
4167
4168/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
4169/// match movhlps. The lower half elements should come from the upper half of
4170/// V1 (and in order), and the upper half elements should come from the upper
4171/// half of V2 (and in order).
4172static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
4173 if (!VT.is128BitVector())
4174 return false;
4175 if (VT.getVectorNumElements() != 4)
4176 return false;
4177 for (unsigned i = 0, e = 2; i != e; ++i)
4178 if (!isUndefOrEqual(Mask[i], i+2))
4179 return false;
4180 for (unsigned i = 2; i != 4; ++i)
4181 if (!isUndefOrEqual(Mask[i], i+4))
4182 return false;
4183 return true;
4184}
4185
4186/// isScalarLoadToVector - Returns true if the node is a scalar load that
4187/// is promoted to a vector. It also returns the LoadSDNode by reference if
4188/// required.
4189static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
4190 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
4191 return false;
4192 N = N->getOperand(0).getNode();
4193 if (!ISD::isNON_EXTLoad(N))
4194 return false;
4195 if (LD)
4196 *LD = cast<LoadSDNode>(N);
4197 return true;
4198}
4199
4200// Test whether the given value is a vector value which will be legalized
4201// into a load.
4202static bool WillBeConstantPoolLoad(SDNode *N) {
4203 if (N->getOpcode() != ISD::BUILD_VECTOR)
4204 return false;
4205
4206 // Check for any non-constant elements.
4207 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
4208 switch (N->getOperand(i).getNode()->getOpcode()) {
4209 case ISD::UNDEF:
4210 case ISD::ConstantFP:
4211 case ISD::Constant:
4212 break;
4213 default:
4214 return false;
4215 }
4216
4217 // Vectors of all-zeros and all-ones are materialized with special
4218 // instructions rather than being loaded.
4219 return !ISD::isBuildVectorAllZeros(N) &&
4220 !ISD::isBuildVectorAllOnes(N);
4221}
4222
4223/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
4224/// match movlp{s|d}. The lower half elements should come from the lower half of
4225/// V1 (and in order), and the upper half elements should come from the upper
4226/// half of V2 (and in order). And since V1 will become the source of the
4227/// MOVLP, it must be either a vector load or a scalar load to vector.
4228static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
4229 ArrayRef<int> Mask, EVT VT) {
4230 if (!VT.is128BitVector())
4231 return false;
4232
4233 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
4234 return false;
4235 // If V2 is a vector load, don't do this transformation. We will try to use
4236 // a load-folding shufps op instead.
4237 if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
4238 return false;
4239
4240 unsigned NumElems = VT.getVectorNumElements();
4241
4242 if (NumElems != 2 && NumElems != 4)
4243 return false;
4244 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4245 if (!isUndefOrEqual(Mask[i], i))
4246 return false;
4247 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4248 if (!isUndefOrEqual(Mask[i], i+NumElems))
4249 return false;
4250 return true;
4251}
4252
4253/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
4254/// all the same.
4255static bool isSplatVector(SDNode *N) {
4256 if (N->getOpcode() != ISD::BUILD_VECTOR)
4257 return false;
4258
4259 SDValue SplatValue = N->getOperand(0);
4260 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
4261 if (N->getOperand(i) != SplatValue)
4262 return false;
4263 return true;
4264}
4265
4266/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
4267/// to a zero vector.
4268/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
4269static bool isZeroShuffle(ShuffleVectorSDNode *N) {
4270 SDValue V1 = N->getOperand(0);
4271 SDValue V2 = N->getOperand(1);
4272 unsigned NumElems = N->getValueType(0).getVectorNumElements();
4273 for (unsigned i = 0; i != NumElems; ++i) {
4274 int Idx = N->getMaskElt(i);
4275 if (Idx >= (int)NumElems) {
4276 unsigned Opc = V2.getOpcode();
4277 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
4278 continue;
4279 if (Opc != ISD::BUILD_VECTOR ||
4280 !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
4281 return false;
4282 } else if (Idx >= 0) {
4283 unsigned Opc = V1.getOpcode();
4284 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
4285 continue;
4286 if (Opc != ISD::BUILD_VECTOR ||
4287 !X86::isZeroNode(V1.getOperand(Idx)))
4288 return false;
4289 }
4290 }
4291 return true;
4292}
4293
4294/// getZeroVector - Returns a vector of specified type with all zero elements.
4295///
4296static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
4297 SelectionDAG &DAG, DebugLoc dl) {
4298 assert(VT.isVector() && "Expected a vector type");
4299 unsigned Size = VT.getSizeInBits();
4300
4301 // Always build SSE zero vectors as <4 x i32> bitcasted
4302 // to their dest type. This ensures they get CSE'd.
4303 SDValue Vec;
4304 if (Size == 128) { // SSE
4305 if (Subtarget->hasSSE2()) { // SSE2
4306 SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4307 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4308 } else { // SSE1
4309 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4310 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
4311 }
4312 } else if (Size == 256) { // AVX
4313 if (Subtarget->hasAVX2()) { // AVX2
4314 SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4315 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4316 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
4317 } else {
4318 // 256-bit logic and arithmetic instructions in AVX are all
4319 // floating-point, no support for integer ops. Emit fp zeroed vectors.
4320 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4321 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4322 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
4323 }
4324 } else
4325 llvm_unreachable("Unexpected vector type");
4326
4327 return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4328}
4329
4330/// getOnesVector - Returns a vector of specified type with all bits set.
4331/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4332/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
4333/// Then bitcast to their original type, ensuring they get CSE'd.
4334static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
4335 DebugLoc dl) {
4336 assert(VT.isVector() && "Expected a vector type");
4337 unsigned Size = VT.getSizeInBits();
4338
4339 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
4340 SDValue Vec;
4341 if (Size == 256) {
4342 if (HasAVX2) { // AVX2
4343 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4344 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
4345 } else { // AVX
4346 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4347 Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4348 }
4349 } else if (Size == 128) {
4350 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4351 } else
4352 llvm_unreachable("Unexpected vector type");
4353
4354 return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4355}
4356
4357/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
4358/// that point to V2 point to its first element.
4359static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
4360 for (unsigned i = 0; i != NumElems; ++i) {
4361 if (Mask[i] > (int)NumElems) {
4362 Mask[i] = NumElems;
4363 }
4364 }
4365}
4366
4367/// getMOVLMask - Returns a vector_shuffle mask for a movs{s|d}, movd
4368/// operation of specified width.
4369static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4370 SDValue V2) {
4371 unsigned NumElems = VT.getVectorNumElements();
4372 SmallVector<int, 8> Mask;
4373 Mask.push_back(NumElems);
4374 for (unsigned i = 1; i != NumElems; ++i)
4375 Mask.push_back(i);
4376 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4377}
4378
4379/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
4380static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4381 SDValue V2) {
4382 unsigned NumElems = VT.getVectorNumElements();
4383 SmallVector<int, 8> Mask;
4384 for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4385 Mask.push_back(i);
4386 Mask.push_back(i + NumElems);
4387 }
4388 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4389}
4390
4391/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
4392static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4393 SDValue V2) {
4394 unsigned NumElems = VT.getVectorNumElements();
4395 SmallVector<int, 8> Mask;
4396 for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
4397 Mask.push_back(i + Half);
4398 Mask.push_back(i + NumElems + Half);
4399 }
4400 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4401}
4402
4403// PromoteSplati8i16 - i16 and i8 vector types can't be used directly by
4404// a generic shuffle instruction because the target has no such instructions.
4405// Generate shuffles which repeat i16 and i8 several times until they can be
4406// represented by v4f32 and then be manipulated by target-supported shuffles.
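// For example (illustrative): to splat element 5 of a v8i16, one unpackh
// produces <e4,e4,e5,e5,e6,e6,e7,e7> and EltNo becomes 1; after the v4f32
// bitcast in getLegalSplat, 32-bit lane 1 holds two copies of e5.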
// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
// a generic shuffle instruction because the target has no such instructions.
// Generate shuffles which repeat i16 and i8 several times until they can be
// represented by v4f32 and then be manipulated by target supported shuffles.
static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
  EVT VT = V.getValueType();
  int NumElems = VT.getVectorNumElements();
  DebugLoc dl = V.getDebugLoc();

  while (NumElems > 4) {
    if (EltNo < NumElems/2) {
      V = getUnpackl(DAG, dl, VT, V, V);
    } else {
      V = getUnpackh(DAG, dl, VT, V, V);
      EltNo -= NumElems/2;
    }
    NumElems >>= 1;
  }
  return V;
}

/// getLegalSplat - Generate a legal splat with supported x86 shuffles
static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
  EVT VT = V.getValueType();
  DebugLoc dl = V.getDebugLoc();
  unsigned Size = VT.getSizeInBits();

  if (Size == 128) {
    V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
    V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
                             &SplatMask[0]);
  } else if (Size == 256) {
    // To use VPERMILPS to splat scalars, the second half of indices must
    // refer to the higher part, which is a duplication of the lower one,
    // because VPERMILPS can only handle in-lane permutations.
    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };

    V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
    V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
                             &SplatMask[0]);
  } else
    llvm_unreachable("Vector size not supported");

  return DAG.getNode(ISD::BITCAST, dl, VT, V);
}

/// PromoteSplat - Splat is promoted to target supported vector shuffles.
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
  EVT SrcVT = SV->getValueType(0);
  SDValue V1 = SV->getOperand(0);
  DebugLoc dl = SV->getDebugLoc();

  int EltNo = SV->getSplatIndex();
  int NumElems = SrcVT.getVectorNumElements();
  unsigned Size = SrcVT.getSizeInBits();

  assert(((Size == 128 && NumElems > 4) || Size == 256) &&
         "Unknown how to promote splat for type");

  // Extract the 128-bit part containing the splat element and update
  // the splat element index when it refers to the higher register.
  if (Size == 256) {
    V1 = Extract128BitVector(V1, EltNo, DAG, dl);
    if (EltNo >= NumElems/2)
      EltNo -= NumElems/2;
  }

  // All i16 and i8 vector types can't be used directly by a generic shuffle
  // instruction because the target has no such instruction. Generate shuffles
  // which repeat i16 and i8 several times until they fit in i32, and then can
  // be manipulated by target supported shuffles.
  EVT EltVT = SrcVT.getVectorElementType();
  if (EltVT == MVT::i8 || EltVT == MVT::i16)
    V1 = PromoteSplati8i16(V1, DAG, EltNo);

  // Recreate the 256-bit vector and place the same 128-bit vector
  // into the low and high part. This is necessary because we want
  // to use VPERM* to shuffle the vectors.
  if (Size == 256) {
    V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
  }

  return getLegalSplat(DAG, V1, EltNo);
}
/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
/// vector and a zero or undef vector. This produces a shuffle where the low
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
                                           bool IsZero,
                                           const X86Subtarget *Subtarget,
                                           SelectionDAG &DAG) {
  EVT VT = V2.getValueType();
  SDValue V1 = IsZero
    ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 16> MaskVec;
  for (unsigned i = 0; i != NumElems; ++i)
    // If this is the insertion idx, put the low elt of V2 here.
    MaskVec.push_back(i == Idx ? NumElems : i);
  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
}

/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
/// target specific opcode. Returns true if the Mask could be calculated.
/// Sets IsUnary to true if it uses only one source.
static bool getTargetShuffleMask(SDNode *N, MVT VT,
                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
  unsigned NumElems = VT.getVectorNumElements();
  SDValue ImmN;

  IsUnary = false;
  switch(N->getOpcode()) {
  case X86ISD::SHUFP:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    break;
  case X86ISD::UNPCKH:
    DecodeUNPCKHMask(VT, Mask);
    break;
  case X86ISD::UNPCKL:
    DecodeUNPCKLMask(VT, Mask);
    break;
  case X86ISD::MOVHLPS:
    DecodeMOVHLPSMask(NumElems, Mask);
    break;
  case X86ISD::MOVLHPS:
    DecodeMOVLHPSMask(NumElems, Mask);
    break;
  case X86ISD::PSHUFD:
  case X86ISD::VPERMILP:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFHW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFLW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::VPERMI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSS:
  case X86ISD::MOVSD: {
    // The index 0 always comes from the first element of the second source,
    // this is why MOVSS and MOVSD are used in the first place. The other
    // elements come from the other positions of the first source vector.
    Mask.push_back(NumElems);
    for (unsigned i = 1; i != NumElems; ++i) {
      Mask.push_back(i);
    }
    break;
  }
  case X86ISD::VPERM2X128:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    if (Mask.empty()) return false;
    break;
  case X86ISD::MOVDDUP:
  case X86ISD::MOVLHPD:
  case X86ISD::MOVLPD:
  case X86ISD::MOVLPS:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::PALIGN:
    // Not yet implemented
    return false;
  default: llvm_unreachable("unknown target shuffle node");
  }

  return true;
}
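// [Editor's illustrative sketch -- compiled out, not part of the original
// source] The Decode* helpers above live in Utils/X86ShuffleDecode. For
// PSHUFD-style ops, each 2-bit field of the imm8 selects one of four dword
// lanes. A minimal standalone decoder; the decodePSHUFDImm name is
// hypothetical, not the actual helper.
#if 0
#include <cassert>

static void decodePSHUFDImm(unsigned Imm, int Mask[4]) {
  for (unsigned i = 0; i != 4; ++i)
    Mask[i] = (Imm >> (2 * i)) & 0x3; // bits [2i+1:2i] pick the source lane
}

int main() {
  int Mask[4];
  decodePSHUFDImm(0x1B, Mask); // 0b00'01'10'11 -> full reverse: <3,2,1,0>
  assert(Mask[0] == 3 && Mask[1] == 2 && Mask[2] == 1 && Mask[3] == 0);
  return 0;
}
#endif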
/// getShuffleScalarElt - Returns the scalar element that will make up the ith
/// element of the result of the vector shuffle.
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
                                   unsigned Depth) {
  if (Depth == 6)
    return SDValue();  // Limit search depth.

  SDValue V = SDValue(N, 0);
  EVT VT = V.getValueType();
  unsigned Opcode = V.getOpcode();

  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
    int Elt = SV->getMaskElt(Index);

    if (Elt < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    unsigned NumElems = VT.getVectorNumElements();
    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
                                         : SV->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
  }

  // Recurse into target specific vector shuffles to find scalars.
  if (isTargetShuffle(Opcode)) {
    MVT ShufVT = V.getValueType().getSimpleVT();
    unsigned NumElems = ShufVT.getVectorNumElements();
    SmallVector<int, 16> ShuffleMask;
    SDValue ImmN;
    bool IsUnary;

    if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
      return SDValue();

    int Elt = ShuffleMask[Index];
    if (Elt < 0)
      return DAG.getUNDEF(ShufVT.getVectorElementType());

    SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
                                         : N->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
                               Depth+1);
  }

  // Actual nodes that may contain scalar elements
  if (Opcode == ISD::BITCAST) {
    V = V.getOperand(0);
    EVT SrcVT = V.getValueType();
    unsigned NumElems = VT.getVectorNumElements();

    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
  }

  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? V.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (V.getOpcode() == ISD::BUILD_VECTOR)
    return V.getOperand(Index);

  return SDValue();
}

/// getNumOfConsecutiveZeros - Return the number of elements of a vector
/// shuffle operation which consecutively come from zero. The search can
/// start in two different directions, from left or right.
static
unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
                                  bool ZerosFromLeft, SelectionDAG &DAG) {
  unsigned i;
  for (i = 0; i != NumElems; ++i) {
    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
    SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
    if (!(Elt.getNode() &&
         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
      break;
  }

  return i;
}
/// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
/// correspond consecutively to elements from one of the vector operands,
/// starting from its index OpIdx. Also tell OpNum which source vector operand.
static
bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
                              unsigned MaskI, unsigned MaskE, unsigned OpIdx,
                              unsigned NumElems, unsigned &OpNum) {
  bool SeenV1 = false;
  bool SeenV2 = false;

  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
    int Idx = SVOp->getMaskElt(i);
    // Ignore undef indices
    if (Idx < 0)
      continue;

    if (Idx < (int)NumElems)
      SeenV1 = true;
    else
      SeenV2 = true;

    // Only accept consecutive elements from the same vector
    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
      return false;
  }

  OpNum = SeenV1 ? 0 : 1;
  return true;
}

/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
/// logical right shift of a vector.
static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
              false /* check zeros from right */, DAG);
  unsigned OpSrc;

  if (!NumZeros)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
  //
  //               V1 = {X, A, B, C}     0
  //                         \  \  \    /
  //   vector_shuffle V1, V2 <1, 2, 3, X>
  //
  if (!isShuffleMaskConsecutive(SVOp,
            0,                   // Mask Start Index
            NumElems-NumZeros,   // Mask End Index (exclusive)
            NumZeros,            // Where to start looking in the src vector
            NumElems,            // Number of elements in vector
            OpSrc))              // Which source operand?
    return false;

  isLeft = false;
  ShAmt = NumZeros;
  ShVal = SVOp->getOperand(OpSrc);
  return true;
}

/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
/// logical left shift of a vector.
static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
              true /* check zeros from left */, DAG);
  unsigned OpSrc;

  if (!NumZeros)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
  //
  //                           0    { A, B, X, X } = V2
  //                          / \    /  /
  //   vector_shuffle V1, V2 <X, X, 4, 5>
  //
  if (!isShuffleMaskConsecutive(SVOp,
            NumZeros,     // Mask Start Index
            NumElems,     // Mask End Index (exclusive)
            0,            // Where to start looking in the src vector
            NumElems,     // Number of elements in vector
            OpSrc))       // Which source operand?
    return false;

  isLeft = true;
  ShAmt = NumZeros;
  ShVal = SVOp->getOperand(OpSrc);
  return true;
}
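// [Editor's illustrative sketch -- compiled out, not part of the original
// source] On a plain mask, the shift checks above amount to: count zero
// elements at one end, then require the rest to be consecutive elements of
// a single source. A standalone model of the left-shift case (zeros shifted
// in from the left), using -2 to mark known-zero elements; the
// modelShiftLeft name is hypothetical.
#if 0
#include <cassert>

// Returns the element shift amount (PSLLDQ-style), or -1 if no match.
static int modelShiftLeft(const int *Mask, unsigned NumElems) {
  unsigned NumZeros = 0;
  while (NumZeros != NumElems && Mask[NumZeros] == -2)
    ++NumZeros; // zeros shifted in from the left
  if (!NumZeros)
    return -1;
  unsigned OpIdx = 0;
  for (unsigned i = NumZeros; i != NumElems; ++i, ++OpIdx)
    if (Mask[i] >= 0 && Mask[i] != (int)OpIdx)
      return -1; // remaining elements must be V1[0], V1[1], ...
  return (int)NumZeros;
}

int main() {
  int M[4] = { -2, -2, 0, 1 }; // <zero, zero, V1[0], V1[1]>: shift left by 2
  assert(modelShiftLeft(M, 4) == 2);
  return 0;
}
#endif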
/// isVectorShift - Returns true if the shuffle can be implemented as a
/// logical left or right shift of a vector.
static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  // Although the logic below supports any bitwidth size, there are no
  // shift instructions which handle more than 128-bit vectors.
  if (!SVOp->getValueType(0).is128BitVector())
    return false;

  if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
      isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
    return true;

  return false;
}

/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
///
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget* Subtarget,
                                     const TargetLowering &TLI) {
  if (NumNonZero > 8)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V(0, 0);
  bool First = true;
  for (unsigned i = 0; i < 16; ++i) {
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    if (ThisIsNonZero && First) {
      if (NumZero)
        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
      else
        V = DAG.getUNDEF(MVT::v8i16);
      First = false;
    }

    if ((i & 1) != 0) {
      SDValue ThisElt(0, 0), LastElt(0, 0);
      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
      if (LastIsNonZero) {
        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
                              MVT::i16, Op.getOperand(i-1));
      }
      if (ThisIsNonZero) {
        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
                              ThisElt, DAG.getConstant(8, MVT::i8));
        if (LastIsNonZero)
          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
      } else
        ThisElt = LastElt;

      if (ThisElt.getNode())
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
                        DAG.getIntPtrConstant(i/2));
    }
  }

  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
}

/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
///
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget* Subtarget,
                                     const TargetLowering &TLI) {
  if (NumNonZero > 4)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V(0, 0);
  bool First = true;
  for (unsigned i = 0; i < 8; ++i) {
    bool isNonZero = (NonZeros & (1 << i)) != 0;
    if (isNonZero) {
      if (First) {
        if (NumZero)
          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
        else
          V = DAG.getUNDEF(MVT::v8i16);
        First = false;
      }
      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
                      MVT::v8i16, V, Op.getOperand(i),
                      DAG.getIntPtrConstant(i));
    }
  }

  return V;
}

/// getVShift - Return a vector logical shift node.
///
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
                         unsigned NumBits, SelectionDAG &DAG,
                         const TargetLowering &TLI, DebugLoc dl) {
  assert(VT.is128BitVector() && "Unknown type for VShift");
  EVT ShVT = MVT::v2i64;
  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
  SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
  return DAG.getNode(ISD::BITCAST, dl, VT,
                     DAG.getNode(Opc, dl, ShVT, SrcOp,
                             DAG.getConstant(NumBits,
                                  TLI.getShiftAmountTy(SrcOp.getValueType()))));
}
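// [Editor's illustrative sketch -- compiled out, not part of the original
// source] LowerBuildVectorv16i8 packs byte pairs into i16 words (even byte
// in the low half, odd byte zero-extended, shifted left 8, and OR'd) so two
// byte inserts become one pinsrw. The scalar arithmetic it relies on:
#if 0
#include <cassert>
#include <cstdint>

int main() {
  uint8_t Even = 0x34, Odd = 0x12; // bytes 2k and 2k+1 of the build_vector
  uint16_t Word = (uint16_t)(((uint16_t)Odd << 8) | Even);
  assert(Word == 0x1234); // word k of the v8i16, little-endian byte order
  return 0;
}
#endif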
SDValue
X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
                                          SelectionDAG &DAG) const {

  // Check if the scalar load can be widened into a vector load, and if
  // the address is "base + cst", see if the cst can be "absorbed" into
  // the shuffle mask.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
    SDValue Ptr = LD->getBasePtr();
    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
      return SDValue();
    EVT PVT = LD->getValueType(0);
    if (PVT != MVT::i32 && PVT != MVT::f32)
      return SDValue();

    int FI = -1;
    int64_t Offset = 0;
    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
      FI = FINode->getIndex();
      Offset = 0;
    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
      Offset = Ptr.getConstantOperandVal(1);
      Ptr = Ptr.getOperand(0);
    } else {
      return SDValue();
    }

    // FIXME: 256-bit vector instructions don't require a strict alignment,
    // improve this code to support it better.
    unsigned RequiredAlign = VT.getSizeInBits()/8;
    SDValue Chain = LD->getChain();
    // Make sure the stack object alignment is at least 16 or 32.
    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
      if (MFI->isFixedObjectIndex(FI)) {
        // Can't change the alignment. FIXME: It's possible to compute
        // the exact stack offset and reference FI + adjust offset instead.
        // If someone *really* cares about this. That's the way to implement it.
        return SDValue();
      } else {
        MFI->setObjectAlignment(FI, RequiredAlign);
      }
    }

    // The offset within the aligned chunk (Offset % 16 or 32) must be a
    // multiple of 4. The address is then Ptr + (Offset & ~(RequiredAlign-1)).
    if (Offset < 0)
      return SDValue();
    if ((Offset % RequiredAlign) & 3)
      return SDValue();
    int64_t StartOffset = Offset & ~(RequiredAlign-1);
    if (StartOffset)
      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
                        Ptr, DAG.getConstant(StartOffset, Ptr.getValueType()));

    int EltNo = (Offset - StartOffset) >> 2;
    unsigned NumElems = VT.getVectorNumElements();

    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
                             LD->getPointerInfo().getWithOffset(StartOffset),
                             false, false, false, 0);

    SmallVector<int, 8> Mask;
    for (unsigned i = 0; i != NumElems; ++i)
      Mask.push_back(EltNo);

    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
  }

  return SDValue();
}
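// [Editor's illustrative sketch -- compiled out, not part of the original
// source] The arithmetic above absorbs a constant pointer offset into the
// splat index: the load is widened to start at the aligned base, and the
// element number is recovered from the leftover offset.
#if 0
#include <cassert>
#include <cstdint>

int main() {
  const unsigned RequiredAlign = 16; // a 128-bit vector load
  int64_t Offset = 24;               // scalar load at base + 24
  assert(((Offset % RequiredAlign) & 3) == 0);
  int64_t StartOffset = Offset & ~(int64_t)(RequiredAlign - 1); // 16
  int EltNo = (int)((Offset - StartOffset) >> 2);               // (24-16)/4 == 2
  assert(StartOffset == 16 && EltNo == 2);
  // The splat then uses mask <2,2,2,2> on the wide load at base+16.
  return 0;
}
#endif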
/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
/// vector of type 'VT', see if the elements can be replaced by a single large
/// load which has the same value as a build_vector whose operands are 'elts'.
///
/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
///
/// FIXME: we'd also like to handle the case where the last elements are zero
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
/// There's even a handy isZeroNode for that purpose.
static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
                                        DebugLoc &DL, SelectionDAG &DAG) {
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElems = Elts.size();

  LoadSDNode *LDBase = NULL;
  unsigned LastLoadedElt = -1U;

  // For each element in the initializer, see if we've found a load or an
  // undef. If we don't find an initial load element, or later load elements
  // are non-consecutive, bail out.
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Elts[i];

    if (!Elt.getNode() ||
        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
      return SDValue();
    if (!LDBase) {
      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
        return SDValue();
      LDBase = cast<LoadSDNode>(Elt.getNode());
      LastLoadedElt = i;
      continue;
    }
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;

    LoadSDNode *LD = cast<LoadSDNode>(Elt);
    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
      return SDValue();
    LastLoadedElt = i;
  }

  // If we have found an entire vector of loads and undefs, then return a large
  // load of the entire vector width starting at the base pointer. If we found
  // consecutive loads for the low half, generate a vzext_load node.
  if (LastLoadedElt == NumElems - 1) {
    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
      return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                         LDBase->getPointerInfo(),
                         LDBase->isVolatile(), LDBase->isNonTemporal(),
                         LDBase->isInvariant(), 0);
    return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                       LDBase->getPointerInfo(),
                       LDBase->isVolatile(), LDBase->isNonTemporal(),
                       LDBase->isInvariant(), LDBase->getAlignment());
  }
  if (NumElems == 4 && LastLoadedElt == 1 &&
      DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
    SDValue ResNode =
        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64,
                                LDBase->getPointerInfo(),
                                LDBase->getAlignment(),
                                false/*isVolatile*/, true/*ReadMem*/,
                                false/*WriteMem*/);

    // Make sure the newly-created LOAD is in the same position as LDBase in
    // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
    // update uses of LDBase's output chain to use the TokenFactor.
    if (LDBase->hasAnyUseOfValue(1)) {
      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                             SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
                             SDValue(ResNode.getNode(), 1));
    }

    return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
  }
  return SDValue();
}
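// [Editor's illustrative sketch -- compiled out, not part of the original
// source] DAG.isConsecutiveLoad checks that element i lives at
// BaseAddr + i * EltSize. A standalone model over plain (address, present)
// pairs, where "absent" plays the role of undef elements; the model* names
// are hypothetical.
#if 0
#include <cassert>
#include <cstdint>

struct Elt { uint64_t Addr; bool Present; };

static bool modelConsecutive(const Elt *Elts, unsigned N, unsigned EltSize) {
  bool HaveBase = false;
  uint64_t Base = 0;
  for (unsigned i = 0; i != N; ++i) {
    if (!Elts[i].Present) {
      if (i == 0) return false; // the first element must be a real load
      continue;                 // undef holes later on are fine
    }
    if (!HaveBase) { HaveBase = true; Base = Elts[i].Addr; continue; }
    if (Elts[i].Addr != Base + i * EltSize)
      return false;             // non-consecutive load: bail out
  }
  return HaveBase;
}

int main() {
  Elt E[4] = { {0x1000, true}, {0x1004, true}, {0, false}, {0x100c, true} };
  assert(modelConsecutive(E, 4, 4)); // loads at +0, +4, undef, +12
  return 0;
}
#endif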
/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
/// to generate a splat value for the following cases:
/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
/// a scalar load, or a constant.
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
SDValue
X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
  if (!Subtarget->hasAVX())
    return SDValue();

  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for broadcast.");

  SDValue Ld;
  bool ConstSplatVal;

  switch (Op.getOpcode()) {
    default:
      // Unknown pattern found.
      return SDValue();

    case ISD::BUILD_VECTOR: {
      // The BUILD_VECTOR node must be a splat.
      if (!isSplatVector(Op.getNode()))
        return SDValue();

      Ld = Op.getOperand(0);
      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
                       Ld.getOpcode() == ISD::ConstantFP);

      // The suspected load node has several users. Make sure that all
      // of its users are from the BUILD_VECTOR node.
      // Constants may have multiple users.
      if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
        return SDValue();
      break;
    }

    case ISD::VECTOR_SHUFFLE: {
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);

      // Shuffles must have a splat mask where the first element is
      // broadcasted.
      if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
        return SDValue();

      SDValue Sc = Op.getOperand(0);
      if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
          Sc.getOpcode() != ISD::BUILD_VECTOR) {

        if (!Subtarget->hasAVX2())
          return SDValue();

        // Use the register form of the broadcast instruction available on AVX2.
        if (VT.is256BitVector())
          Sc = Extract128BitVector(Sc, 0, DAG, dl);
        return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
      }

      Ld = Sc.getOperand(0);
      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
                       Ld.getOpcode() == ISD::ConstantFP);

      // The scalar_to_vector node and the suspected
      // load node must have exactly one user.
      // Constants may have multiple users.
      if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
        return SDValue();
      break;
    }
  }

  bool Is256 = VT.is256BitVector();

  // Handle broadcasting a single constant scalar from the constant pool
  // into a vector. On Sandybridge it is still better to load a constant vector
  // from the constant pool and not to broadcast it from a scalar.
  if (ConstSplatVal && Subtarget->hasAVX2()) {
    EVT CVT = Ld.getValueType();
    assert(!CVT.isVector() && "Must not broadcast a vector type");
    unsigned ScalarSize = CVT.getSizeInBits();

    if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
      const Constant *C = 0;
      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
        C = CI->getConstantIntValue();
      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
        C = CF->getConstantFPValue();

      assert(C && "Invalid constant type");

      SDValue CP = DAG.getConstantPool(C, getPointerTy());
      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
      Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
                       MachinePointerInfo::getConstantPool(),
                       false, false, false, Alignment);

      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
    }
  }

  bool IsLoad = ISD::isNormalLoad(Ld.getNode());
  unsigned ScalarSize = Ld.getValueType().getSizeInBits();

  // Handle AVX2 in-register broadcasts.
  if (!IsLoad && Subtarget->hasAVX2() &&
      (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
  // The scalar source must be a normal load.
  if (!IsLoad)
    return SDValue();

  if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The integer check is needed for the 64-bit into 128-bit case, so that we
  // don't try to match a double: there is no vbroadcastsd xmm.
  if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) {
    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
  }

  // Unsupported broadcast.
  return SDValue();
}
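// [Editor's illustrative sketch -- compiled out, not part of the original
// source] The final decision logic above, flattened into a standalone
// predicate over (scalar size, vector width, load-or-register, AVX2). The
// constant-pool path is handled separately, and the 8/16/64-bit arm only
// applies to integer scalars in the real code; the model name is
// hypothetical.
#if 0
#include <cassert>

static bool modelCanBroadcast(unsigned ScalarSize, bool Is256, bool IsLoad,
                              bool HasAVX2) {
  bool SizeOK = ScalarSize == 32 || (Is256 && ScalarSize == 64);
  if (!IsLoad)
    return HasAVX2 && SizeOK; // AVX2 register-form broadcasts
  if (SizeOK)
    return true;              // vbroadcastss / vbroadcastsd from memory
  // AVX2 also broadcasts 8/16/64-bit integer loads; there is no
  // vbroadcastsd xmm for a 128-bit double broadcast.
  return HasAVX2 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64);
}

int main() {
  assert(modelCanBroadcast(32, false, true, false));  // AVX vbroadcastss
  assert(!modelCanBroadcast(64, false, true, false)); // no vbroadcastsd xmm
  assert(modelCanBroadcast(16, false, true, true));   // AVX2 vpbroadcastw
  return 0;
}
#endif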
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();

  EVT VT = Op.getValueType();
  EVT ExtVT = VT.getVectorElementType();
  unsigned NumElems = Op.getNumOperands();

  // Vectors containing all zeros can be matched by pxor and xorps later
  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
    if (VT == MVT::v4i32 || VT == MVT::v8i32)
      return Op;

    return getZeroVector(VT, Subtarget, DAG, dl);
  }

  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
  // vpcmpeqd on 256-bit vectors.
  if (ISD::isBuildVectorAllOnes(Op.getNode())) {
    if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasAVX2()))
      return Op;

    return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl);
  }

  SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
  if (Broadcast.getNode())
    return Broadcast;

  unsigned EVTBits = ExtVT.getSizeInBits();

  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  unsigned NonZeros = 0;
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;
    Values.insert(Elt);
    if (Elt.getOpcode() != ISD::Constant &&
        Elt.getOpcode() != ISD::ConstantFP)
      IsAllConstants = false;
    if (X86::isZeroNode(Elt))
      NumZero++;
    else {
      NonZeros |= (1 << i);
      NumNonZero++;
    }
  }

  // All undef vector. Return an UNDEF. All zero vectors were handled above.
  if (NumNonZero == 0)
    return DAG.getUNDEF(VT);

  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1) {
    unsigned Idx = CountTrailingZeros_32(NonZeros);
    SDValue Item = Op.getOperand(Idx);

    // If this is an insertion of an i64 value on x86-32, and if the top bits of
    // the value are obviously zero, truncate the value to i32 and do the
    // insertion that way. Only do this if the value is non-constant or if the
    // value is a constant being inserted into element 0. It is cheaper to do
    // a constant pool load than it is to do a movd + shuffle.
    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
        (!IsAllConstants || Idx == 0)) {
      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
        // Handle SSE only.
        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
        EVT VecVT = MVT::v4i32;
        unsigned VecElts = 4;

        // Truncate the value (which may itself be a constant) to i32, and
        // convert it to a vector with movd (S2V+shuffle to zero extend).
        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
        Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);

        // Now we have our 32-bit value zero extended in the low element of
        // a vector. If Idx != 0, swizzle it into place.
        if (Idx != 0) {
          SmallVector<int, 4> Mask;
          Mask.push_back(Idx);
          for (unsigned i = 1; i != VecElts; ++i)
            Mask.push_back(i);
          Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
                                      &Mask[0]);
        }
        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
      }
    }

    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements. This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.
    if (Idx == 0) {
      if (NumZero == 0)
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
        if (VT.is256BitVector()) {
          SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
          return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
                             Item, DAG.getIntPtrConstant(0));
        }
        assert(VT.is128BitVector() && "Expected an SSE value type!");
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
      }

      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
        if (VT.is256BitVector()) {
          SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
          Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
        } else {
          assert(VT.is128BitVector() && "Expected an SSE value type!");
          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
        }
        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
      }
    }

    // Is it a vector logical left shift?
    if (NumElems == 2 && Idx == 1 &&
        X86::isZeroNode(Op.getOperand(0)) &&
        !X86::isZeroNode(Op.getOperand(1))) {
      unsigned NumBits = VT.getSizeInBits();
      return getVShift(true, VT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   VT, Op.getOperand(1)),
                       NumBits/2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the element
    // is a non-constant being inserted into an element other than the low one,
    // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
    // movd/movss) to move this into the low element, then shuffle it into
    // place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      // Turn it into a shuffle of zero and zero-extended scalar to vector.
      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
      SmallVector<int, 8> MaskVec;
      for (unsigned i = 0; i != NumElems; ++i)
        MaskVec.push_back(i == Idx ? 0 : 1);
      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
    }
  }
  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1) {
    if (EVTBits == 32) {
      // Instead of a shuffle like this:
      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // Check if it's possible to issue this instead.
      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
      unsigned Idx = CountTrailingZeros_32(NonZeros);
      SDValue Item = Op.getOperand(Idx);
      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
    }
    return SDValue();
  }

  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  // For AVX-length vectors, build the individual 128-bit pieces and use
  // shuffles to put them in place.
  if (VT.is256BitVector()) {
    SmallVector<SDValue, 32> V;
    for (unsigned i = 0; i != NumElems; ++i)
      V.push_back(Op.getOperand(i));

    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);

    // Build both the lower and upper subvector.
    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
                                NumElems/2);

    // Recreate the wider vector with the lower and upper part.
    return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
  }

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // One half is zero or undef.
      unsigned Idx = CountTrailingZeros_32(NonZeros);
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                               Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
    }
    return SDValue();
  }

  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16) {
    SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG,
                                      Subtarget, *this);
    if (V.getNode()) return V;
  }

  if (EVTBits == 16 && NumElems == 8) {
    SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG,
                                      Subtarget, *this);
    if (V.getNode()) return V;
  }

  // If element VT is == 32 bits, turn it into a number of shuffles.
  SmallVector<SDValue, 8> V(NumElems);
  if (NumElems == 4 && NumZero > 0) {
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1 << i));
      if (isZero)
        V[i] = getZeroVector(VT, Subtarget, DAG, dl);
      else
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
        default: break;
        case 0:
          V[i] = V[i*2];  // Must be a zero vector.
          break;
        case 1:
          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
          break;
        case 2:
          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
        case 3:
          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
      }
    }

    bool Reverse1 = (NonZeros & 0x3) == 2;
    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    int MaskVec[] = {
      Reverse1 ? 1 : 0,
      Reverse1 ? 0 : 1,
      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
      static_cast<int>(Reverse2 ? NumElems : NumElems+1)
    };
    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
  }

  if (Values.size() > 1 && VT.is128BitVector()) {
    // Check for a build vector of consecutive loads.
    for (unsigned i = 0; i < NumElems; ++i)
      V[i] = Op.getOperand(i);

    // Check for elements which are consecutive loads.
    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
    if (LD.getNode())
      return LD;

    // For SSE 4.1, use insertps to put the high elements into the low element.
    if (getSubtarget()->hasSSE41()) {
      SDValue Result;
      if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
      else
        Result = DAG.getUNDEF(VT);

      for (unsigned i = 1; i < NumElems; ++i) {
        if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                             Op.getOperand(i), DAG.getIntPtrConstant(i));
      }
      return Result;
    }

    // Otherwise, expand into a number of unpckl*, start by extending each of
    // our (non-undef) elements to the full vector width with the element in the
    // bottom slot of the vector (which generates no code for SSE).
    for (unsigned i = 0; i < NumElems; ++i) {
      if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
      else
        V[i] = DAG.getUNDEF(VT);
    }

    // Next, we iteratively mix elements, e.g. for v4f32:
    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
    unsigned EltStride = NumElems >> 1;
    while (EltStride != 0) {
      for (unsigned i = 0; i < EltStride; ++i) {
        // If V[i+EltStride] is undef and this is the first round of mixing,
        // then it is safe to just drop this shuffle: V[i] is already in the
        // right place, the one element (since it's the first round) being
        // inserted as undef can be dropped. This isn't safe for successive
        // rounds because they will permute elements within both vectors.
        if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
            EltStride == NumElems/2)
          continue;

        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
      }
      EltStride >>= 1;
    }
    return V[0];
  }
  return SDValue();
}

// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  EVT ResVT = Op.getValueType();

  assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");

  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = ResVT.getVectorNumElements();

  return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}

static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 2);

  // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
  // from two other 128-bit ones.
  return LowerAVXCONCAT_VECTORS(Op, DAG);
}
// Try to lower a shuffle node into a simple blend instruction.
static SDValue
LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
                           const X86Subtarget *Subtarget, SelectionDAG &DAG) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  MVT VT = SVOp->getValueType(0).getSimpleVT();
  unsigned NumElems = VT.getVectorNumElements();

  if (!Subtarget->hasSSE41())
    return SDValue();

  unsigned ISDNo = 0;
  MVT OpTy;

  switch (VT.SimpleTy) {
  default: return SDValue();
  case MVT::v8i16:
    ISDNo = X86ISD::BLENDPW;
    OpTy = MVT::v8i16;
    break;
  case MVT::v4i32:
  case MVT::v4f32:
    ISDNo = X86ISD::BLENDPS;
    OpTy = MVT::v4f32;
    break;
  case MVT::v2i64:
  case MVT::v2f64:
    ISDNo = X86ISD::BLENDPD;
    OpTy = MVT::v2f64;
    break;
  case MVT::v8i32:
  case MVT::v8f32:
    if (!Subtarget->hasAVX())
      return SDValue();
    ISDNo = X86ISD::BLENDPS;
    OpTy = MVT::v8f32;
    break;
  case MVT::v4i64:
  case MVT::v4f64:
    if (!Subtarget->hasAVX())
      return SDValue();
    ISDNo = X86ISD::BLENDPD;
    OpTy = MVT::v4f64;
    break;
  }
  assert(ISDNo && "Invalid Op Number");

  unsigned MaskVals = 0;

  for (unsigned i = 0; i != NumElems; ++i) {
    int EltIdx = SVOp->getMaskElt(i);
    if (EltIdx == (int)i || EltIdx < 0)
      MaskVals |= (1<<i);
    else if (EltIdx == (int)(i + NumElems))
      continue; // Bit is set to zero.
    else
      return SDValue();
  }

  V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1);
  V2 = DAG.getNode(ISD::BITCAST, dl, OpTy, V2);
  SDValue Ret = DAG.getNode(ISDNo, dl, OpTy, V1, V2,
                            DAG.getConstant(MaskVals, MVT::i32));
  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
}
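// [Editor's illustrative sketch -- compiled out, not part of the original
// source] The loop above builds the blend immediate: bit i is set when
// result element i stays with the first operand (identity or undef), clear
// when it takes element i of the second operand, and anything else is not a
// blend. A standalone model; the modelBlendImm name is hypothetical.
#if 0
#include <cassert>

// Returns the blend mask, or -1 if the shuffle is not an elementwise blend.
static int modelBlendImm(const int *Mask, unsigned NumElems) {
  unsigned MaskVals = 0;
  for (unsigned i = 0; i != NumElems; ++i) {
    if (Mask[i] == (int)i || Mask[i] < 0)
      MaskVals |= 1u << i;               // element stays with operand 1
    else if (Mask[i] != (int)(i + NumElems))
      return -1;                         // element changes position: no blend
  }
  return (int)MaskVals;
}

int main() {
  int M[4] = { 0, 5, 2, 7 };           // V1[0], V2[1], V1[2], V2[3]
  assert(modelBlendImm(M, 4) == 0x5);  // bits 0 and 2 set
  return 0;
}
#endif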
// v8i16 shuffles - Prefer shuffles in the following order:
// 1. [all]   pshuflw, pshufhw, optional move
// 2. [ssse3] 1 x pshufb
// 3. [ssse3] 2 x pshufb + 1 x por
// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
static SDValue
LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
                         SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  SmallVector<int, 8> MaskVals;

  // Determine if more than 1 of the words in each of the low and high
  // quadwords of the result come from the same quadword of one of the two
  // inputs. Undef mask values count as coming from any quadword, for better
  // codegen.
  unsigned LoQuad[] = { 0, 0, 0, 0 };
  unsigned HiQuad[] = { 0, 0, 0, 0 };
  std::bitset<4> InputQuads;
  for (unsigned i = 0; i < 8; ++i) {
    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
    int EltIdx = SVOp->getMaskElt(i);
    MaskVals.push_back(EltIdx);
    if (EltIdx < 0) {
      ++Quad[0];
      ++Quad[1];
      ++Quad[2];
      ++Quad[3];
      continue;
    }
    ++Quad[EltIdx / 4];
    InputQuads.set(EltIdx / 4);
  }

  int BestLoQuad = -1;
  unsigned MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (LoQuad[i] > MaxQuad) {
      BestLoQuad = i;
      MaxQuad = LoQuad[i];
    }
  }

  int BestHiQuad = -1;
  MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (HiQuad[i] > MaxQuad) {
      BestHiQuad = i;
      MaxQuad = HiQuad[i];
    }
  }

  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
  // of the two input vectors, shuffle them into one input vector so only a
  // single pshufb instruction is necessary. If there are more than 2 input
  // quads, disable the next transformation since it does not help SSSE3.
  bool V1Used = InputQuads[0] || InputQuads[1];
  bool V2Used = InputQuads[2] || InputQuads[3];
  if (Subtarget->hasSSSE3()) {
    if (InputQuads.count() == 2 && V1Used && V2Used) {
      BestLoQuad = InputQuads[0] ? 0 : 1;
      BestHiQuad = InputQuads[2] ? 2 : 3;
    }
    if (InputQuads.count() > 2) {
      BestLoQuad = -1;
      BestHiQuad = -1;
    }
  }

  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
  // the shuffle mask. If a quad is scored as -1, that means that it contains
  // words from all 4 input quadwords.
  SDValue NewV;
  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
    int MaskV[] = {
      BestLoQuad < 0 ? 0 : BestLoQuad,
      BestHiQuad < 0 ? 1 : BestHiQuad
    };
    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);

    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
    // source words for the shuffle, to aid later transformations.
    bool AllWordsInNewV = true;
    bool InOrder[2] = { true, true };
    for (unsigned i = 0; i != 8; ++i) {
      int idx = MaskVals[i];
      if (idx != (int)i)
        InOrder[i/4] = false;
      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
        continue;
      AllWordsInNewV = false;
      break;
    }

    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
    if (AllWordsInNewV) {
      for (int i = 0; i != 8; ++i) {
        int idx = MaskVals[i];
        if (idx < 0)
          continue;
        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
        if ((idx != i) && idx < 4)
          pshufhw = false;
        if ((idx != i) && idx > 3)
          pshuflw = false;
      }
      V1 = NewV;
      V2Used = false;
      BestLoQuad = 0;
      BestHiQuad = 1;
    }
    // If we've eliminated the use of V2, and the new mask is a pshuflw or
    // pshufhw, that's as cheap as it gets. Return the new shuffle.
    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
      unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
      unsigned TargetMask = 0;
      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
      TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp)
                           : getShufflePSHUFLWImmediate(SVOp);
      V1 = NewV.getOperand(0);
      return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
    }
  }

  // If we have SSSE3, and all words of the result are from 1 input vector,
  // case 2 is generated, otherwise case 3 is generated. If no SSSE3
  // is present, fall back to case 4.
  if (Subtarget->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;

    // If we have elements from both input vectors, set the high bit of the
    // shuffle mask element to zero out elements that come from V2 in the V1
    // mask, and elements that come from V1 in the V2 mask, so that the two
    // results can be OR'd together.
    bool TwoInputs = V1Used && V2Used;
    for (unsigned i = 0; i != 8; ++i) {
      int EltIdx = MaskVals[i] * 2;
      int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
      int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
      pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
    }
    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    if (!TwoInputs)
      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);

    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
    pshufbMask.clear();
    for (unsigned i = 0; i != 8; ++i) {
      int EltIdx = MaskVals[i] * 2;
      int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
      int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
      pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
    }
    V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
  }

  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
  // and update MaskVals with new element order.
  std::bitset<8> InOrder;
  if (BestLoQuad >= 0) {
    int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
    for (int i = 0; i != 4; ++i) {
      int idx = MaskVals[i];
      if (idx < 0) {
        InOrder.set(i);
      } else if ((idx / 4) == BestLoQuad) {
        MaskV[i] = idx & 3;
        InOrder.set(i);
      }
    }
    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                &MaskV[0]);

    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
      NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
                                  NewV.getOperand(0),
                                  getShufflePSHUFLWImmediate(SVOp), DAG);
    }
  }
  // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
  // and update MaskVals with the new element order.
  if (BestHiQuad >= 0) {
    int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
    for (unsigned i = 4; i != 8; ++i) {
      int idx = MaskVals[i];
      if (idx < 0) {
        InOrder.set(i);
      } else if ((idx / 4) == BestHiQuad) {
        MaskV[i] = (idx & 3) + 4;
        InOrder.set(i);
      }
    }
    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                &MaskV[0]);

    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
      NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
                                  NewV.getOperand(0),
                                  getShufflePSHUFHWImmediate(SVOp), DAG);
    }
  }

  // In case BestHiQuad & BestLoQuad were both -1, which means each quadword
  // has a word from each of the four input quadwords, calculate the InOrder
  // bitvector now before falling through to the insert/extract cleanup.
  if (BestLoQuad == -1 && BestHiQuad == -1) {
    NewV = V1;
    for (int i = 0; i != 8; ++i)
      if (MaskVals[i] < 0 || MaskVals[i] == i)
        InOrder.set(i);
  }

  // The other elements are put in the right place using pextrw and pinsrw.
  for (unsigned i = 0; i != 8; ++i) {
    if (InOrder[i])
      continue;
    int EltIdx = MaskVals[i];
    if (EltIdx < 0)
      continue;
    SDValue ExtOp = (EltIdx < 8) ?
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
                  DAG.getIntPtrConstant(EltIdx)) :
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
                  DAG.getIntPtrConstant(EltIdx - 8));
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
                       DAG.getIntPtrConstant(i));
  }
  return NewV;
}
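// [Editor's illustrative sketch -- compiled out, not part of the original
// source] The SSSE3 path above scales a v8i16 word mask to a v16i8 PSHUFB
// byte mask: word index w becomes bytes 2w and 2w+1, and 0x80 (sign bit set)
// makes PSHUFB write a zero byte. This standalone model assumes a fully
// defined word mask; the model* names are hypothetical.
#if 0
#include <cassert>

static void modelWordMaskToPshufb(const int *WordMask, unsigned char Bytes[16],
                                  bool TwoInputs) {
  for (unsigned i = 0; i != 8; ++i) {
    int EltIdx = WordMask[i] * 2; // byte index of the word's low byte
    bool FromV2 = EltIdx >= 16;
    Bytes[2*i]   = (TwoInputs && FromV2) ? 0x80 : (unsigned char)EltIdx;
    Bytes[2*i+1] = (TwoInputs && FromV2) ? 0x80 : (unsigned char)(EltIdx + 1);
  }
}

int main() {
  int WordMask[8] = { 1, 0, 3, 2, 5, 4, 7, 6 }; // swap adjacent words, V1 only
  unsigned char Bytes[16];
  modelWordMaskToPshufb(WordMask, Bytes, /*TwoInputs=*/false);
  assert(Bytes[0] == 2 && Bytes[1] == 3); // result word 0 = V1 bytes 2..3
  return 0;
}
#endif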
// v16i8 shuffles - Prefer shuffles in the following order:
// 1. [ssse3] 1 x pshufb
// 2. [ssse3] 2 x pshufb + 1 x por
// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
static
SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG,
                                 const X86TargetLowering &TLI) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  ArrayRef<int> MaskVals = SVOp->getMask();

  // If we have SSSE3, case 1 is generated when all result bytes come from
  // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
  // present, fall back to case 3.

  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
  if (TLI.getSubtarget()->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;

    // If all result elements are from one input vector, then only translate
    // undef mask values to 0x80 (zero out result) in the pshufb mask.
    //
    // Otherwise, we have elements from both input vectors, and must zero out
    // elements that come from V2 in the first mask, and V1 in the second mask
    // so that we can OR them together.
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      if (EltIdx < 0 || EltIdx >= 16)
        EltIdx = 0x80;
      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
    }
    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));

    // As PSHUFB will zero elements with negative indices, it's safe to ignore
    // the 2nd operand if it's undefined or zero.
    if (V2.getOpcode() == ISD::UNDEF ||
        ISD::isBuildVectorAllZeros(V2.getNode()))
      return V1;

    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
    pshufbMask.clear();
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
    }
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
  }

  // No SSSE3 - Calculate in place words and then fix all out of place words
  // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
  // the 16 different words that comprise the two doublequadword input vectors.
  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
  SDValue NewV = V1;
  for (int i = 0; i != 8; ++i) {
    int Elt0 = MaskVals[i*2];
    int Elt1 = MaskVals[i*2+1];

    // This word of the result is all undef, skip it.
    if (Elt0 < 0 && Elt1 < 0)
      continue;

    // This word of the result is already in the correct place, skip it.
    if ((Elt0 == i*2) && (Elt1 == i*2+1))
      continue;

    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
    SDValue InsElt;

    // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
    // together using a single extract, extract the word once and insert it.
    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                         DAG.getIntPtrConstant(i));
      continue;
    }

    // If Elt1 is defined, extract it from the appropriate source. If the
    // source byte is not also odd, shift the extracted word left 8 bits;
    // otherwise clear the bottom 8 bits if we need to do an or.
    if (Elt1 >= 0) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      if ((Elt1 & 1) == 0)
        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
                             DAG.getConstant(8,
                                  TLI.getShiftAmountTy(InsElt.getValueType())));
      else if (Elt0 >= 0)
        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
                             DAG.getConstant(0xFF00, MVT::i16));
    }
    // If Elt0 is defined, extract it from the appropriate source. If the
    // source byte is not also even, shift the extracted word right 8 bits. If
    // Elt1 was also defined, OR the extracted values together before
    // inserting them in the result.
    if (Elt0 >= 0) {
      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
      if ((Elt0 & 1) != 0)
        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
                              DAG.getConstant(8,
                                 TLI.getShiftAmountTy(InsElt0.getValueType())));
      else if (Elt1 >= 0)
        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
                              DAG.getConstant(0x00FF, MVT::i16));
      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
                         : InsElt0;
    }
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                       DAG.getIntPtrConstant(i));
  }
  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
}

// v32i8 shuffles - Translate to VPSHUFB if possible.
static
SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
                                 const X86Subtarget *Subtarget,
                                 SelectionDAG &DAG) {
  EVT VT = SVOp->getValueType(0);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());

  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());

  // VPSHUFB may be generated if
  // (1) one of the input vectors is undef or zeroinitializer.
  // The mask value 0x80 puts 0 in the corresponding slot of the vector.
  // And (2) the mask indexes don't cross the 128-bit lane.
  if (VT != MVT::v32i8 || !Subtarget->hasAVX2() ||
      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
    return SDValue();

  if (V1IsAllZero && !V2IsAllZero) {
    CommuteVectorShuffleMask(MaskVals, 32);
    V1 = V2;
  }
  SmallVector<SDValue, 32> pshufbMask;
  for (unsigned i = 0; i != 32; i++) {
    int EltIdx = MaskVals[i];
    if (EltIdx < 0 || EltIdx >= 32)
      EltIdx = 0x80;
    else {
      if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
        // Cross lane is not allowed.
        return SDValue();
      EltIdx &= 0xf;
    }
    pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
  }
  return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v32i8, &pshufbMask[0], 32));
}
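// [Editor's illustrative sketch -- compiled out, not part of the original
// source] The lane check above encodes the AVX2 rule: VPSHUFB shuffles each
// 128-bit half independently, so source byte EltIdx and result position i
// must sit in the same 16-byte lane, with the in-lane index being
// EltIdx & 0xf. A standalone model of that test; the name is hypothetical.
#if 0
#include <cassert>

static bool modelSameLane(int EltIdx, unsigned i) {
  return (EltIdx < 16) == (i < 16); // both in lane 0 or both in lane 1
}

int main() {
  assert(modelSameLane(3, 5));   // lane 0 -> lane 0: allowed, uses index 3
  assert(modelSameLane(18, 20)); // lane 1 -> lane 1: allowed, 18 & 0xf == 2
  assert(!modelSameLane(3, 20)); // lane 0 byte into a lane 1 slot: rejected
  return 0;
}
#endif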
6011 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
6012 static
6013 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
6014                                  SelectionDAG &DAG, DebugLoc dl) {
6015   MVT VT = SVOp->getValueType(0).getSimpleVT();
6016   unsigned NumElems = VT.getVectorNumElements();
6017   MVT NewVT;
6018   unsigned Scale;
6019   switch (VT.SimpleTy) {
6020   default: llvm_unreachable("Unexpected!");
6021   case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
6022   case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
6023   case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
6024   case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
6025   case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
6026   case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
6027   }
6028
6029   SmallVector<int, 8> MaskVec;
6030   for (unsigned i = 0; i != NumElems; i += Scale) {
6031     int StartIdx = -1;
6032     for (unsigned j = 0; j != Scale; ++j) {
6033       int EltIdx = SVOp->getMaskElt(i+j);
6034       if (EltIdx < 0)
6035         continue;
6036       if (StartIdx < 0)
6037         StartIdx = (EltIdx / Scale);
6038       if (EltIdx != (int)(StartIdx*Scale + j))
6039         return SDValue();
6040     }
6041     MaskVec.push_back(StartIdx);
6042   }
6043
6044   SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
6045   SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
6046   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
6047 }
6048
6049 /// getVZextMovL - Return a zero-extending vector move low node.
6050 ///
6051 static SDValue getVZextMovL(EVT VT, EVT OpVT,
6052                             SDValue SrcOp, SelectionDAG &DAG,
6053                             const X86Subtarget *Subtarget, DebugLoc dl) {
6054   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
6055     LoadSDNode *LD = NULL;
6056     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
6057       LD = dyn_cast<LoadSDNode>(SrcOp);
6058     if (!LD) {
6059       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
6060       // instead.
6061       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
6062       if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
6063           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6064           SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
6065           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
6066         // PR2108
6067         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
6068         return DAG.getNode(ISD::BITCAST, dl, VT,
6069                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
6070                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
6071                                                    OpVT,
6072                                                    SrcOp.getOperand(0)
6073                                                         .getOperand(0))));
6074       }
6075     }
6076   }
6077
6078   return DAG.getNode(ISD::BITCAST, dl, VT,
6079                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
6080                                  DAG.getNode(ISD::BITCAST, dl,
6081                                              OpVT, SrcOp)));
6082 }
6083
6084 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
6085 /// which could not be matched by any known target specific shuffle.
6086 static SDValue
6087 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
6088
6089   SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
6090   if (NewOp.getNode())
6091     return NewOp;
6092
6093   EVT VT = SVOp->getValueType(0);
6094
6095   unsigned NumElems = VT.getVectorNumElements();
6096   unsigned NumLaneElems = NumElems / 2;
6097
6098   DebugLoc dl = SVOp->getDebugLoc();
6099   MVT EltVT = VT.getVectorElementType().getSimpleVT();
6100   EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
6101   SDValue Output[2];
6102
6103   SmallVector<int, 16> Mask;
6104   for (unsigned l = 0; l < 2; ++l) {
6105     // Build a shuffle mask for the output, discovering on the fly which
6106     // input vectors to use as shuffle operands (recorded in InputUsed).
6107 // If building a suitable shuffle vector proves too hard, then bail 6108 // out with UseBuildVector set. 6109 bool UseBuildVector = false; 6110 int InputUsed[2] = { -1, -1 }; // Not yet discovered. 6111 unsigned LaneStart = l * NumLaneElems; 6112 for (unsigned i = 0; i != NumLaneElems; ++i) { 6113 // The mask element. This indexes into the input. 6114 int Idx = SVOp->getMaskElt(i+LaneStart); 6115 if (Idx < 0) { 6116 // the mask element does not index into any input vector. 6117 Mask.push_back(-1); 6118 continue; 6119 } 6120 6121 // The input vector this mask element indexes into. 6122 int Input = Idx / NumLaneElems; 6123 6124 // Turn the index into an offset from the start of the input vector. 6125 Idx -= Input * NumLaneElems; 6126 6127 // Find or create a shuffle vector operand to hold this input. 6128 unsigned OpNo; 6129 for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { 6130 if (InputUsed[OpNo] == Input) 6131 // This input vector is already an operand. 6132 break; 6133 if (InputUsed[OpNo] < 0) { 6134 // Create a new operand for this input vector. 6135 InputUsed[OpNo] = Input; 6136 break; 6137 } 6138 } 6139 6140 if (OpNo >= array_lengthof(InputUsed)) { 6141 // More than two input vectors used! Give up on trying to create a 6142 // shuffle vector. Insert all elements into a BUILD_VECTOR instead. 6143 UseBuildVector = true; 6144 break; 6145 } 6146 6147 // Add the mask index for the new shuffle vector. 6148 Mask.push_back(Idx + OpNo * NumLaneElems); 6149 } 6150 6151 if (UseBuildVector) { 6152 SmallVector<SDValue, 16> SVOps; 6153 for (unsigned i = 0; i != NumLaneElems; ++i) { 6154 // The mask element. This indexes into the input. 6155 int Idx = SVOp->getMaskElt(i+LaneStart); 6156 if (Idx < 0) { 6157 SVOps.push_back(DAG.getUNDEF(EltVT)); 6158 continue; 6159 } 6160 6161 // The input vector this mask element indexes into. 6162 int Input = Idx / NumElems; 6163 6164 // Turn the index into an offset from the start of the input vector. 6165 Idx -= Input * NumElems; 6166 6167 // Extract the vector element by hand. 6168 SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, 6169 SVOp->getOperand(Input), 6170 DAG.getIntPtrConstant(Idx))); 6171 } 6172 6173 // Construct the output using a BUILD_VECTOR. 6174 Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0], 6175 SVOps.size()); 6176 } else if (InputUsed[0] < 0) { 6177 // No input vectors were used! The result is undefined. 6178 Output[l] = DAG.getUNDEF(NVT); 6179 } else { 6180 SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2), 6181 (InputUsed[0] % 2) * NumLaneElems, 6182 DAG, dl); 6183 // If only one input was used, use an undefined vector for the other. 6184 SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) : 6185 Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2), 6186 (InputUsed[1] % 2) * NumLaneElems, DAG, dl); 6187 // At least one input vector was used. Create a new shuffle vector. 6188 Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); 6189 } 6190 6191 Mask.clear(); 6192 } 6193 6194 // Concatenate the result back 6195 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]); 6196} 6197 6198/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with 6199/// 4 elements, and match them with several different shuffle types. 
6200 static SDValue
6201 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
6202   SDValue V1 = SVOp->getOperand(0);
6203   SDValue V2 = SVOp->getOperand(1);
6204   DebugLoc dl = SVOp->getDebugLoc();
6205   EVT VT = SVOp->getValueType(0);
6206
6207   assert(VT.is128BitVector() && "Unsupported vector size");
6208
6209   std::pair<int, int> Locs[4];
6210   int Mask1[] = { -1, -1, -1, -1 };
6211   SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
6212
6213   unsigned NumHi = 0;
6214   unsigned NumLo = 0;
6215   for (unsigned i = 0; i != 4; ++i) {
6216     int Idx = PermMask[i];
6217     if (Idx < 0) {
6218       Locs[i] = std::make_pair(-1, -1);
6219     } else {
6220       assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
6221       if (Idx < 4) {
6222         Locs[i] = std::make_pair(0, NumLo);
6223         Mask1[NumLo] = Idx;
6224         NumLo++;
6225       } else {
6226         Locs[i] = std::make_pair(1, NumHi);
6227         if (2+NumHi < 4)
6228           Mask1[2+NumHi] = Idx;
6229         NumHi++;
6230       }
6231     }
6232   }
6233
6234   if (NumLo <= 2 && NumHi <= 2) {
6235     // If no more than two elements come from either vector, this can be
6236     // implemented with two shuffles. The first shuffle gathers the elements.
6237     // The second shuffle, which takes the first shuffle as both of its
6238     // vector operands, puts the elements into the right order.
6239     V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6240
6241     int Mask2[] = { -1, -1, -1, -1 };
6242
6243     for (unsigned i = 0; i != 4; ++i)
6244       if (Locs[i].first != -1) {
6245         unsigned Idx = (i < 2) ? 0 : 4;
6246         Idx += Locs[i].first * 2 + Locs[i].second;
6247         Mask2[i] = Idx;
6248       }
6249
6250     return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
6251   }
6252
6253   if (NumLo == 3 || NumHi == 3) {
6254     // Otherwise, we must have three elements from one vector, call it X, and
6255     // one element from the other, call it Y. First, use a shufps to build an
6256     // intermediate vector with the one element from Y and the element from X
6257     // that will be in the same half in the final destination (the indices
6258     // don't matter). Then, use a shufps to build the final vector, taking the
6259     // half containing the element from Y from the intermediate, and the other
6260     // half from X.
6261     if (NumHi == 3) {
6262       // Normalize it so the 3 elements come from V1.
6263       CommuteVectorShuffleMask(PermMask, 4);
6264       std::swap(V1, V2);
6265     }
6266
6267     // Find the element from V2.
6268     unsigned HiIndex;
6269     for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
6270       int Val = PermMask[HiIndex];
6271       if (Val < 0)
6272         continue;
6273       if (Val >= 4)
6274         break;
6275     }
6276
6277     Mask1[0] = PermMask[HiIndex];
6278     Mask1[1] = -1;
6279     Mask1[2] = PermMask[HiIndex^1];
6280     Mask1[3] = -1;
6281     V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6282
6283     if (HiIndex >= 2) {
6284       Mask1[0] = PermMask[0];
6285       Mask1[1] = PermMask[1];
6286       Mask1[2] = HiIndex & 1 ? 6 : 4;
6287       Mask1[3] = HiIndex & 1 ? 4 : 6;
6288       return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6289     }
6290
6291     Mask1[0] = HiIndex & 1 ? 2 : 0;
6292     Mask1[1] = HiIndex & 1 ? 0 : 2;
6293     Mask1[2] = PermMask[2];
6294     Mask1[3] = PermMask[3];
6295     if (Mask1[2] >= 0)
6296       Mask1[2] += 4;
6297     if (Mask1[3] >= 0)
6298       Mask1[3] += 4;
6299     return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
6300   }
6301
6302   // Break it into (shuffle shuffle_hi, shuffle_lo).
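  // Illustrative walk-through (the mask below is an assumed example, not
  // taken from a real test case): for PermMask <0,2,1,3>, the loop below
  // builds LoMask <0,2,-1,-1> and HiMask <1,3,-1,-1>; the final shuffle then
  // selects <0,1,4,5> out of (LoShuffle, HiShuffle), reassembling
  // <V1[0],V1[2],V1[1],V1[3]> as requested.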
6303   int LoMask[] = { -1, -1, -1, -1 };
6304   int HiMask[] = { -1, -1, -1, -1 };
6305
6306   int *MaskPtr = LoMask;
6307   unsigned MaskIdx = 0;
6308   unsigned LoIdx = 0;
6309   unsigned HiIdx = 2;
6310   for (unsigned i = 0; i != 4; ++i) {
6311     if (i == 2) {
6312       MaskPtr = HiMask;
6313       MaskIdx = 1;
6314       LoIdx = 0;
6315       HiIdx = 2;
6316     }
6317     int Idx = PermMask[i];
6318     if (Idx < 0) {
6319       Locs[i] = std::make_pair(-1, -1);
6320     } else if (Idx < 4) {
6321       Locs[i] = std::make_pair(MaskIdx, LoIdx);
6322       MaskPtr[LoIdx] = Idx;
6323       LoIdx++;
6324     } else {
6325       Locs[i] = std::make_pair(MaskIdx, HiIdx);
6326       MaskPtr[HiIdx] = Idx;
6327       HiIdx++;
6328     }
6329   }
6330
6331   SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
6332   SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
6333   int MaskOps[] = { -1, -1, -1, -1 };
6334   for (unsigned i = 0; i != 4; ++i)
6335     if (Locs[i].first != -1)
6336       MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
6337   return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
6338 }
6339
6340 static bool MayFoldVectorLoad(SDValue V) {
6341   if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
6342     V = V.getOperand(0);
6343   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6344     V = V.getOperand(0);
6345   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
6346       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
6347     // BUILD_VECTOR (load), undef
6348     V = V.getOperand(0);
6349   if (MayFoldLoad(V))
6350     return true;
6351   return false;
6352 }
6353
6354 // FIXME: the version above should always be used. Since there's
6355 // a bug where several vector shuffles can't be folded because the
6356 // DAG is not updated during lowering and a node claims to have two
6357 // uses while it only has one, use this version, and let isel match
6358 // another instruction if the load really happens to have more than
6359 // one use. Remove this version after this bug gets fixed.
6360 // rdar://8434668, PR8156
6361 static bool RelaxedMayFoldVectorLoad(SDValue V) {
6362   if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
6363     V = V.getOperand(0);
6364   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6365     V = V.getOperand(0);
6366   if (ISD::isNormalLoad(V.getNode()))
6367     return true;
6368   return false;
6369 }
6370
6371 static
6372 SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
6373   EVT VT = Op.getValueType();
6374
6375   // Canonicalize to v2f64.
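  // (Illustrative note: MOVDDUP is defined on f64 lanes, so the input is
  // viewed as v2f64 for the duplication and the result is bitcast back to
  // the original type afterwards.)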
6376   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
6377   return DAG.getNode(ISD::BITCAST, dl, VT,
6378                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
6379                                           V1, DAG));
6380 }
6381
6382 static
6383 SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
6384                         bool HasSSE2) {
6385   SDValue V1 = Op.getOperand(0);
6386   SDValue V2 = Op.getOperand(1);
6387   EVT VT = Op.getValueType();
6388
6389   assert(VT != MVT::v2i64 && "unsupported shuffle type");
6390
6391   if (HasSSE2 && VT == MVT::v2f64)
6392     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
6393
6394   // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
6395   return DAG.getNode(ISD::BITCAST, dl, VT,
6396                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
6397                          DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
6398                          DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
6399 }
6400
6401 static
6402 SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
6403   SDValue V1 = Op.getOperand(0);
6404   SDValue V2 = Op.getOperand(1);
6405   EVT VT = Op.getValueType();
6406
6407   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
6408          "unsupported shuffle type");
6409
6410   if (V2.getOpcode() == ISD::UNDEF)
6411     V2 = V1;
6412
6413   // v4i32 or v4f32
6414   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
6415 }
6416
6417 static
6418 SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
6419   SDValue V1 = Op.getOperand(0);
6420   SDValue V2 = Op.getOperand(1);
6421   EVT VT = Op.getValueType();
6422   unsigned NumElems = VT.getVectorNumElements();
6423
6424   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
6425   // operand of these instructions is only memory, so check if there's a
6426   // potential load folding here, otherwise use SHUFPS or MOVSD to match the
6427   // same masks.
6428   bool CanFoldLoad = false;
6429
6430   // Trivial case, when V2 comes from a load.
6431   if (MayFoldVectorLoad(V2))
6432     CanFoldLoad = true;
6433
6434   // When V1 is a load, it can be folded later into a store in isel, example:
6435   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
6436   //    turns into:
6437   //  (MOVLPSmr addr:$src1, VR128:$src2)
6438   // So, recognize this potential and also use MOVLPS or MOVLPD
6439   else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
6440     CanFoldLoad = true;
6441
6442   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6443   if (CanFoldLoad) {
6444     if (HasSSE2 && NumElems == 2)
6445       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
6446
6447     if (NumElems == 4)
6448       // If we don't care about the second element, proceed to use movss.
6449       if (SVOp->getMaskElt(1) != -1)
6450         return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
6451   }
6452
6453   // movl and movlp will both match v2i64, but v2i64 is never matched by
6454   // movl earlier because we make it strict to avoid messing with the movlp
6455   // load folding logic (see the code above the getMOVLP call). Match it here
6456   // then; this is horrible, but will stay like this until we move all shuffle
6457   // matching to x86 specific nodes. Note that for the 1st condition all
6458   // types are matched with movsd.
6459   if (HasSSE2) {
6460     // FIXME: isMOVLMask should be checked and matched before getMOVLP,
6461     // as to remove this logic from here, as much as possible
6462     if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
6463       return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
6464     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
6465   }
6466
6467   assert(VT != MVT::v4i32 && "unsupported shuffle type");
6468
6469   // Invert the operand order and use SHUFPS to match it.
6470   return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
6471                               getShuffleSHUFImmediate(SVOp), DAG);
6472 }
6473
6474 SDValue
6475 X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
6476   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6477   EVT VT = Op.getValueType();
6478   DebugLoc dl = Op.getDebugLoc();
6479   SDValue V1 = Op.getOperand(0);
6480   SDValue V2 = Op.getOperand(1);
6481
6482   if (isZeroShuffle(SVOp))
6483     return getZeroVector(VT, Subtarget, DAG, dl);
6484
6485   // Handle splat operations
6486   if (SVOp->isSplat()) {
6487     unsigned NumElem = VT.getVectorNumElements();
6488     int Size = VT.getSizeInBits();
6489
6490     // Use vbroadcast whenever the splat comes from a foldable load
6491     SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
6492     if (Broadcast.getNode())
6493       return Broadcast;
6494
6495     // Handle splats by matching through known shuffle masks
6496     if ((Size == 128 && NumElem <= 4) ||
6497         (Size == 256 && NumElem < 8))
6498       return SDValue();
6499
6500     // All remaining splats are promoted to target supported vector shuffles.
6501     return PromoteSplat(SVOp, DAG);
6502   }
6503
6504   // If the shuffle can be profitably rewritten as a narrower shuffle, then
6505   // do it!
6506   if (VT == MVT::v8i16 || VT == MVT::v16i8 ||
6507       VT == MVT::v16i16 || VT == MVT::v32i8) {
6508     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6509     if (NewOp.getNode())
6510       return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
6511   } else if ((VT == MVT::v4i32 ||
6512              (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
6513     // FIXME: Figure out a cleaner way to do this.
6514     // Try to make use of movq to zero out the top part.
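    // Hedged example of the intent: shuffling (v4i32 X) against zeros with
    // mask <0,1,4,5> narrows to a v2i64 shuffle with mask <0,2>, which is a
    // (commuted) MOVL and can be emitted as a zero-extending movq.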
6515 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 6516 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6517 if (NewOp.getNode()) { 6518 EVT NewVT = NewOp.getValueType(); 6519 if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), 6520 NewVT, true, false)) 6521 return getVZextMovL(VT, NewVT, NewOp.getOperand(0), 6522 DAG, Subtarget, dl); 6523 } 6524 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 6525 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6526 if (NewOp.getNode()) { 6527 EVT NewVT = NewOp.getValueType(); 6528 if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) 6529 return getVZextMovL(VT, NewVT, NewOp.getOperand(1), 6530 DAG, Subtarget, dl); 6531 } 6532 } 6533 } 6534 return SDValue(); 6535} 6536 6537SDValue 6538X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 6539 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6540 SDValue V1 = Op.getOperand(0); 6541 SDValue V2 = Op.getOperand(1); 6542 EVT VT = Op.getValueType(); 6543 DebugLoc dl = Op.getDebugLoc(); 6544 unsigned NumElems = VT.getVectorNumElements(); 6545 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 6546 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 6547 bool V1IsSplat = false; 6548 bool V2IsSplat = false; 6549 bool HasSSE2 = Subtarget->hasSSE2(); 6550 bool HasAVX = Subtarget->hasAVX(); 6551 bool HasAVX2 = Subtarget->hasAVX2(); 6552 MachineFunction &MF = DAG.getMachineFunction(); 6553 bool OptForSize = MF.getFunction()->getFnAttributes(). 6554 hasAttribute(Attributes::OptimizeForSize); 6555 6556 assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); 6557 6558 if (V1IsUndef && V2IsUndef) 6559 return DAG.getUNDEF(VT); 6560 6561 assert(!V1IsUndef && "Op 1 of shuffle should not be undef"); 6562 6563 // Vector shuffle lowering takes 3 steps: 6564 // 6565 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable 6566 // narrowing and commutation of operands should be handled. 6567 // 2) Matching of shuffles with known shuffle masks to x86 target specific 6568 // shuffle nodes. 6569 // 3) Rewriting of unmatched masks into new generic shuffle operations, 6570 // so the shuffle can be broken into other shuffles and the legalizer can 6571 // try the lowering again. 6572 // 6573 // The general idea is that no vector_shuffle operation should be left to 6574 // be matched during isel, all of them must be converted to a target specific 6575 // node here. 6576 6577 // Normalize the input vectors. Here splats, zeroed vectors, profitable 6578 // narrowing and commutation of operands should be handled. The actual code 6579 // doesn't include all of those, work in progress... 6580 SDValue NewOp = NormalizeVectorShuffle(Op, DAG); 6581 if (NewOp.getNode()) 6582 return NewOp; 6583 6584 SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end()); 6585 6586 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and 6587 // unpckh_undef). Only use pshufd if speed is more important than size. 
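  // As a concrete sketch (instruction lengths are approximate): for v4i32,
  // the unpckl_undef mask <0,0,1,1> is both 'punpckldq %xmm0, %xmm0'
  // (4 bytes) and 'pshufd $0x50, %xmm0, %xmm0' (5 bytes, because of the
  // immediate), so when optimizing for size the unpck form wins.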
6588   if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasAVX2))
6589     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
6590   if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasAVX2))
6591     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
6592
6593   if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
6594       V2IsUndef && RelaxedMayFoldVectorLoad(V1))
6595     return getMOVDDup(Op, dl, V1, DAG);
6596
6597   if (isMOVHLPS_v_undef_Mask(M, VT))
6598     return getMOVHighToLow(Op, dl, DAG);
6599
6600   // Used to match splats
6601   if (HasSSE2 && isUNPCKHMask(M, VT, HasAVX2) && V2IsUndef &&
6602       (VT == MVT::v2f64 || VT == MVT::v2i64))
6603     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
6604
6605   if (isPSHUFDMask(M, VT)) {
6606     // The actual implementation will match the mask in the if above and then
6607     // during isel it can match several different instructions, not only pshufd
6608     // as its name says. Sad but true; emulate the behavior for now...
6609     if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
6610       return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
6611
6612     unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
6613
6614     if (HasAVX && (VT == MVT::v4f32 || VT == MVT::v2f64))
6615       return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, DAG);
6616
6617     if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
6618       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
6619
6620     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
6621                                 TargetMask, DAG);
6622   }
6623
6624   // Check if this can be converted into a logical shift.
6625   bool isLeft = false;
6626   unsigned ShAmt = 0;
6627   SDValue ShVal;
6628   bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
6629   if (isShift && ShVal.hasOneUse()) {
6630     // If the shifted value has multiple uses, it may be cheaper to use
6631     // v_set0 + movlhps or movhlps, etc.
6632     EVT EltVT = VT.getVectorElementType();
6633     ShAmt *= EltVT.getSizeInBits();
6634     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
6635   }
6636
6637   if (isMOVLMask(M, VT)) {
6638     if (ISD::isBuildVectorAllZeros(V1.getNode()))
6639       return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
6640     if (!isMOVLPMask(M, VT)) {
6641       if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
6642         return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
6643
6644       if (VT == MVT::v4i32 || VT == MVT::v4f32)
6645         return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
6646     }
6647   }
6648
6649   // FIXME: fold these into legal mask.
6650   if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasAVX2))
6651     return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
6652
6653   if (isMOVHLPSMask(M, VT))
6654     return getMOVHighToLow(Op, dl, DAG);
6655
6656   if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
6657     return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
6658
6659   if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
6660     return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
6661
6662   if (isMOVLPMask(M, VT))
6663     return getMOVLP(Op, dl, DAG, HasSSE2);
6664
6665   if (ShouldXformToMOVHLPS(M, VT) ||
6666       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
6667     return CommuteVectorShuffle(SVOp, DAG);
6668
6669   if (isShift) {
6670     // No better options. Use a vshldq / vsrldq.
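    // E.g. (sketch): a v4i32 shuffle of V1 against zeros with mask <4,0,1,2>
    // is a whole-vector left shift by one element, i.e. pslldq $4.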
6671     EVT EltVT = VT.getVectorElementType();
6672     ShAmt *= EltVT.getSizeInBits();
6673     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
6674   }
6675
6676   bool Commuted = false;
6677   // FIXME: This should also accept a bitcast of a splat? Be careful, not
6678   // 1,1,1,1 -> v8i16 though.
6679   V1IsSplat = isSplatVector(V1.getNode());
6680   V2IsSplat = isSplatVector(V2.getNode());
6681
6682   // Canonicalize the splat or undef, if present, to be on the RHS.
6683   if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
6684     CommuteVectorShuffleMask(M, NumElems);
6685     std::swap(V1, V2);
6686     std::swap(V1IsSplat, V2IsSplat);
6687     Commuted = true;
6688   }
6689
6690   if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
6691     // Shuffling low element of v1 into undef, just return v1.
6692     if (V2IsUndef)
6693       return V1;
6694     // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
6695     // the instruction selector will not match, so get a canonical MOVL with
6696     // swapped operands to undo the commute.
6697     return getMOVL(DAG, dl, VT, V2, V1);
6698   }
6699
6700   if (isUNPCKLMask(M, VT, HasAVX2))
6701     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6702
6703   if (isUNPCKHMask(M, VT, HasAVX2))
6704     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6705
6706   if (V2IsSplat) {
6707     // Normalize the mask so all entries that point to V2 point to its first
6708     // element, then try to match unpck{h|l} again. If it matches, return a
6709     // new vector_shuffle with the corrected mask.
6710     SmallVector<int, 8> NewMask(M.begin(), M.end());
6711     NormalizeMask(NewMask, NumElems);
6712     if (isUNPCKLMask(NewMask, VT, HasAVX2, true))
6713       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6714     if (isUNPCKHMask(NewMask, VT, HasAVX2, true))
6715       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6716   }
6717
6718   if (Commuted) {
6719     // Commute it back and try unpck* again.
6720     // FIXME: this seems wrong.
6721     CommuteVectorShuffleMask(M, NumElems);
6722     std::swap(V1, V2);
6723     std::swap(V1IsSplat, V2IsSplat);
6724     Commuted = false;
6725
6726     if (isUNPCKLMask(M, VT, HasAVX2))
6727       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6728
6729     if (isUNPCKHMask(M, VT, HasAVX2))
6730       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6731   }
6732
6733   // Normalize the node to match x86 shuffle ops if needed
6734   if (!V2IsUndef && (isSHUFPMask(M, VT, HasAVX, /* Commuted */ true)))
6735     return CommuteVectorShuffle(SVOp, DAG);
6736
6737   // The checks below are all present in isShuffleMaskLegal, but they are
6738   // inlined here right now to enable us to directly emit target specific
6739   // nodes, and will be removed one by one until they don't return Op anymore.
6740 6741 if (isPALIGNRMask(M, VT, Subtarget)) 6742 return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, 6743 getShufflePALIGNRImmediate(SVOp), 6744 DAG); 6745 6746 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 6747 SVOp->getSplatIndex() == 0 && V2IsUndef) { 6748 if (VT == MVT::v2f64 || VT == MVT::v2i64) 6749 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 6750 } 6751 6752 if (isPSHUFHWMask(M, VT, HasAVX2)) 6753 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 6754 getShufflePSHUFHWImmediate(SVOp), 6755 DAG); 6756 6757 if (isPSHUFLWMask(M, VT, HasAVX2)) 6758 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 6759 getShufflePSHUFLWImmediate(SVOp), 6760 DAG); 6761 6762 if (isSHUFPMask(M, VT, HasAVX)) 6763 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, 6764 getShuffleSHUFImmediate(SVOp), DAG); 6765 6766 if (isUNPCKL_v_undef_Mask(M, VT, HasAVX2)) 6767 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 6768 if (isUNPCKH_v_undef_Mask(M, VT, HasAVX2)) 6769 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 6770 6771 //===--------------------------------------------------------------------===// 6772 // Generate target specific nodes for 128 or 256-bit shuffles only 6773 // supported in the AVX instruction set. 6774 // 6775 6776 // Handle VMOVDDUPY permutations 6777 if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX)) 6778 return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); 6779 6780 // Handle VPERMILPS/D* permutations 6781 if (isVPERMILPMask(M, VT, HasAVX)) { 6782 if (HasAVX2 && VT == MVT::v8i32) 6783 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, 6784 getShuffleSHUFImmediate(SVOp), DAG); 6785 return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, 6786 getShuffleSHUFImmediate(SVOp), DAG); 6787 } 6788 6789 // Handle VPERM2F128/VPERM2I128 permutations 6790 if (isVPERM2X128Mask(M, VT, HasAVX)) 6791 return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, 6792 V2, getShuffleVPERM2X128Immediate(SVOp), DAG); 6793 6794 SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG); 6795 if (BlendOp.getNode()) 6796 return BlendOp; 6797 6798 if (V2IsUndef && HasAVX2 && (VT == MVT::v8i32 || VT == MVT::v8f32)) { 6799 SmallVector<SDValue, 8> permclMask; 6800 for (unsigned i = 0; i != 8; ++i) { 6801 permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32)); 6802 } 6803 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, 6804 &permclMask[0], 8); 6805 // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 6806 return DAG.getNode(X86ISD::VPERMV, dl, VT, 6807 DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1); 6808 } 6809 6810 if (V2IsUndef && HasAVX2 && (VT == MVT::v4i64 || VT == MVT::v4f64)) 6811 return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, 6812 getShuffleCLImmediate(SVOp), DAG); 6813 6814 6815 //===--------------------------------------------------------------------===// 6816 // Since no target specific shuffle was selected for this generic one, 6817 // lower it into other known shuffles. FIXME: this isn't true yet, but 6818 // this is the plan. 6819 // 6820 6821 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 
6822 if (VT == MVT::v8i16) { 6823 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG); 6824 if (NewOp.getNode()) 6825 return NewOp; 6826 } 6827 6828 if (VT == MVT::v16i8) { 6829 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 6830 if (NewOp.getNode()) 6831 return NewOp; 6832 } 6833 6834 if (VT == MVT::v32i8) { 6835 SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG); 6836 if (NewOp.getNode()) 6837 return NewOp; 6838 } 6839 6840 // Handle all 128-bit wide vectors with 4 elements, and match them with 6841 // several different shuffle types. 6842 if (NumElems == 4 && VT.is128BitVector()) 6843 return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); 6844 6845 // Handle general 256-bit shuffles 6846 if (VT.is256BitVector()) 6847 return LowerVECTOR_SHUFFLE_256(SVOp, DAG); 6848 6849 return SDValue(); 6850} 6851 6852SDValue 6853X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 6854 SelectionDAG &DAG) const { 6855 EVT VT = Op.getValueType(); 6856 DebugLoc dl = Op.getDebugLoc(); 6857 6858 if (!Op.getOperand(0).getValueType().is128BitVector()) 6859 return SDValue(); 6860 6861 if (VT.getSizeInBits() == 8) { 6862 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 6863 Op.getOperand(0), Op.getOperand(1)); 6864 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6865 DAG.getValueType(VT)); 6866 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6867 } 6868 6869 if (VT.getSizeInBits() == 16) { 6870 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6871 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 6872 if (Idx == 0) 6873 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6874 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6875 DAG.getNode(ISD::BITCAST, dl, 6876 MVT::v4i32, 6877 Op.getOperand(0)), 6878 Op.getOperand(1))); 6879 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 6880 Op.getOperand(0), Op.getOperand(1)); 6881 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6882 DAG.getValueType(VT)); 6883 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6884 } 6885 6886 if (VT == MVT::f32) { 6887 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 6888 // the result back to FR32 register. It's only worth matching if the 6889 // result has a single use which is a store or a bitcast to i32. And in 6890 // the case of a store, it's not worth it if the index is a constant 0, 6891 // because a MOVSSmr can be used instead, which is smaller and faster. 6892 if (!Op.hasOneUse()) 6893 return SDValue(); 6894 SDNode *User = *Op.getNode()->use_begin(); 6895 if ((User->getOpcode() != ISD::STORE || 6896 (isa<ConstantSDNode>(Op.getOperand(1)) && 6897 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 6898 (User->getOpcode() != ISD::BITCAST || 6899 User->getValueType(0) != MVT::i32)) 6900 return SDValue(); 6901 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6902 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 6903 Op.getOperand(0)), 6904 Op.getOperand(1)); 6905 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 6906 } 6907 6908 if (VT == MVT::i32 || VT == MVT::i64) { 6909 // ExtractPS/pextrq works with constant index. 
6910     if (isa<ConstantSDNode>(Op.getOperand(1)))
6911       return Op;
6912   }
6913   return SDValue();
6914 }
6915
6916
6917 SDValue
6918 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
6919                                            SelectionDAG &DAG) const {
6920   if (!isa<ConstantSDNode>(Op.getOperand(1)))
6921     return SDValue();
6922
6923   SDValue Vec = Op.getOperand(0);
6924   EVT VecVT = Vec.getValueType();
6925
6926   // If this is a 256-bit vector result, first extract the 128-bit vector and
6927   // then extract the element from the 128-bit vector.
6928   if (VecVT.is256BitVector()) {
6929     DebugLoc dl = Op.getNode()->getDebugLoc();
6930     unsigned NumElems = VecVT.getVectorNumElements();
6931     SDValue Idx = Op.getOperand(1);
6932     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
6933
6934     // Get the 128-bit vector.
6935     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
6936
6937     if (IdxVal >= NumElems/2)
6938       IdxVal -= NumElems/2;
6939     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
6940                        DAG.getConstant(IdxVal, MVT::i32));
6941   }
6942
6943   assert(VecVT.is128BitVector() && "Unexpected vector length");
6944
6945   if (Subtarget->hasSSE41()) {
6946     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
6947     if (Res.getNode())
6948       return Res;
6949   }
6950
6951   EVT VT = Op.getValueType();
6952   DebugLoc dl = Op.getDebugLoc();
6953   // TODO: handle v16i8.
6954   if (VT.getSizeInBits() == 16) {
6955     SDValue Vec = Op.getOperand(0);
6956     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6957     if (Idx == 0)
6958       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
6959                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
6960                                      DAG.getNode(ISD::BITCAST, dl,
6961                                                  MVT::v4i32, Vec),
6962                                      Op.getOperand(1)));
6963     // Transform it so it matches pextrw, which produces a 32-bit result.
6964     EVT EltVT = MVT::i32;
6965     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
6966                                   Op.getOperand(0), Op.getOperand(1));
6967     SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
6968                                  DAG.getValueType(VT));
6969     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
6970   }
6971
6972   if (VT.getSizeInBits() == 32) {
6973     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6974     if (Idx == 0)
6975       return Op;
6976
6977     // SHUFPS the element to the lowest double word, then movss.
6978     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
6979     EVT VVT = Op.getOperand(0).getValueType();
6980     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
6981                                        DAG.getUNDEF(VVT), Mask);
6982     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
6983                        DAG.getIntPtrConstant(0));
6984   }
6985
6986   if (VT.getSizeInBits() == 64) {
6987     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
6988     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
6989     // to match extract_elt for f64.
6990     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6991     if (Idx == 0)
6992       return Op;
6993
6994     // UNPCKHPD the element to the lowest double word, then movsd.
6995     // Note if the lower 64 bits of the result of the UNPCKHPD are then stored
6996     // to f64mem, the whole operation is folded into a single MOVHPDmr.
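    // Sketch of that fold, in the same pattern notation used above
    // (operand names are illustrative):
    //   (store (f64 (extract_elt (v2f64 VR128:$src), 1)), addr:$dst)
    //     => (MOVHPDmr addr:$dst, VR128:$src)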
6997     int Mask[2] = { 1, -1 };
6998     EVT VVT = Op.getOperand(0).getValueType();
6999     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
7000                                        DAG.getUNDEF(VVT), Mask);
7001     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
7002                        DAG.getIntPtrConstant(0));
7003   }
7004
7005   return SDValue();
7006 }
7007
7008 SDValue
7009 X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
7010                                                SelectionDAG &DAG) const {
7011   EVT VT = Op.getValueType();
7012   EVT EltVT = VT.getVectorElementType();
7013   DebugLoc dl = Op.getDebugLoc();
7014
7015   SDValue N0 = Op.getOperand(0);
7016   SDValue N1 = Op.getOperand(1);
7017   SDValue N2 = Op.getOperand(2);
7018
7019   if (!VT.is128BitVector())
7020     return SDValue();
7021
7022   if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
7023       isa<ConstantSDNode>(N2)) {
7024     unsigned Opc;
7025     if (VT == MVT::v8i16)
7026       Opc = X86ISD::PINSRW;
7027     else if (VT == MVT::v16i8)
7028       Opc = X86ISD::PINSRB;
7029     else
7030       Opc = X86ISD::PINSRB;
7031
7032     // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
7033     // argument.
7034     if (N1.getValueType() != MVT::i32)
7035       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
7036     if (N2.getValueType() != MVT::i32)
7037       N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
7038     return DAG.getNode(Opc, dl, VT, N0, N1, N2);
7039   }
7040
7041   if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
7042     // Bits [7:6] of the constant are the source select. This will always be
7043     // zero here. The DAG Combiner may combine an extract_elt index into these
7044     // bits. For example (insert (extract, 3), 2) could be matched by putting
7045     // the '3' into bits [7:6] of X86ISD::INSERTPS.
7046     // Bits [5:4] of the constant are the destination select. This is the
7047     // value of the incoming immediate.
7048     // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
7049     // combine either bitwise AND or insert of float 0.0 to set these bits.
7050     N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
7051     // Create this as a scalar to vector.
7052     N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
7053     return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
7054   }
7055
7056   if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
7057     // PINSR* works with constant index.
7058     return Op;
7059   }
7060   return SDValue();
7061 }
7062
7063 SDValue
7064 X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
7065   EVT VT = Op.getValueType();
7066   EVT EltVT = VT.getVectorElementType();
7067
7068   DebugLoc dl = Op.getDebugLoc();
7069   SDValue N0 = Op.getOperand(0);
7070   SDValue N1 = Op.getOperand(1);
7071   SDValue N2 = Op.getOperand(2);
7072
7073   // If this is a 256-bit vector result, first extract the 128-bit vector,
7074   // insert the element into the extracted half and then place it back.
7075   if (VT.is256BitVector()) {
7076     if (!isa<ConstantSDNode>(N2))
7077       return SDValue();
7078
7079     // Get the desired 128-bit vector half.
7080     unsigned NumElems = VT.getVectorNumElements();
7081     unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
7082     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
7083
7084     // Insert the element into the desired half.
7085     bool Upper = IdxVal >= NumElems/2;
7086     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
7087                     DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32));
7088
7089     // Insert the changed part back into the 256-bit vector.
7090     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
7091   }
7092
7093   if (Subtarget->hasSSE41())
7094     return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
7095
7096   if (EltVT == MVT::i8)
7097     return SDValue();
7098
7099   if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
7100     // Transform it so it matches pinsrw which expects a 16-bit value in a GR32
7101     // as its second argument.
7102     if (N1.getValueType() != MVT::i32)
7103       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
7104     if (N2.getValueType() != MVT::i32)
7105       N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
7106     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
7107   }
7108   return SDValue();
7109 }
7110
7111 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
7112   LLVMContext *Context = DAG.getContext();
7113   DebugLoc dl = Op.getDebugLoc();
7114   EVT OpVT = Op.getValueType();
7115
7116   // If this is a 256-bit vector result, first insert into a 128-bit
7117   // vector and then insert into the 256-bit vector.
7118   if (!OpVT.is128BitVector()) {
7119     // Insert into a 128-bit vector.
7120     EVT VT128 = EVT::getVectorVT(*Context,
7121                                  OpVT.getVectorElementType(),
7122                                  OpVT.getVectorNumElements() / 2);
7123
7124     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
7125
7126     // Insert the 128-bit vector.
7127     return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
7128   }
7129
7130   if (OpVT == MVT::v1i64 &&
7131       Op.getOperand(0).getValueType() == MVT::i64)
7132     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
7133
7134   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
7135   assert(OpVT.is128BitVector() && "Expected an SSE type!");
7136   return DAG.getNode(ISD::BITCAST, dl, OpVT,
7137                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
7138 }
7139
7140 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
7141 // a simple subregister reference or explicit instructions to grab
7142 // upper bits of a vector.
7143 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
7144                                       SelectionDAG &DAG) {
7145   if (Subtarget->hasAVX()) {
7146     DebugLoc dl = Op.getNode()->getDebugLoc();
7147     SDValue Vec = Op.getNode()->getOperand(0);
7148     SDValue Idx = Op.getNode()->getOperand(1);
7149
7150     if (Op.getNode()->getValueType(0).is128BitVector() &&
7151         Vec.getNode()->getValueType(0).is256BitVector() &&
7152         isa<ConstantSDNode>(Idx)) {
7153       unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7154       return Extract128BitVector(Vec, IdxVal, DAG, dl);
7155     }
7156   }
7157   return SDValue();
7158 }
7159
7160 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
7161 // simple superregister reference or explicit instructions to insert
7162 // the upper bits of a vector.
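// For example (an illustrative mapping, not a guaranteed encoding):
// (insert_subvector (v8f32 V), (v4f32 S), 4) targets the upper half and
// can be selected as 'vinsertf128 $1, S, V'.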
7163 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
7164                                      SelectionDAG &DAG) {
7165   if (Subtarget->hasAVX()) {
7166     DebugLoc dl = Op.getNode()->getDebugLoc();
7167     SDValue Vec = Op.getNode()->getOperand(0);
7168     SDValue SubVec = Op.getNode()->getOperand(1);
7169     SDValue Idx = Op.getNode()->getOperand(2);
7170
7171     if (Op.getNode()->getValueType(0).is256BitVector() &&
7172         SubVec.getNode()->getValueType(0).is128BitVector() &&
7173         isa<ConstantSDNode>(Idx)) {
7174       unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7175       return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
7176     }
7177   }
7178   return SDValue();
7179 }
7180
7181 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
7182 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
7183 // one of the above-mentioned nodes. It has to be wrapped because otherwise
7184 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
7185 // be used to form an addressing mode. These wrapped nodes will be selected
7186 // into MOV32ri.
7187 SDValue
7188 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
7189   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
7190
7191   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7192   // global base reg.
7193   unsigned char OpFlag = 0;
7194   unsigned WrapperKind = X86ISD::Wrapper;
7195   CodeModel::Model M = getTargetMachine().getCodeModel();
7196
7197   if (Subtarget->isPICStyleRIPRel() &&
7198       (M == CodeModel::Small || M == CodeModel::Kernel))
7199     WrapperKind = X86ISD::WrapperRIP;
7200   else if (Subtarget->isPICStyleGOT())
7201     OpFlag = X86II::MO_GOTOFF;
7202   else if (Subtarget->isPICStyleStubPIC())
7203     OpFlag = X86II::MO_PIC_BASE_OFFSET;
7204
7205   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
7206                                              CP->getAlignment(),
7207                                              CP->getOffset(), OpFlag);
7208   DebugLoc DL = CP->getDebugLoc();
7209   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7210   // With PIC, the address is actually $g + Offset.
7211   if (OpFlag) {
7212     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7213                          DAG.getNode(X86ISD::GlobalBaseReg,
7214                                      DebugLoc(), getPointerTy()),
7215                          Result);
7216   }
7217
7218   return Result;
7219 }
7220
7221 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
7222   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
7223
7224   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7225   // global base reg.
7226   unsigned char OpFlag = 0;
7227   unsigned WrapperKind = X86ISD::Wrapper;
7228   CodeModel::Model M = getTargetMachine().getCodeModel();
7229
7230   if (Subtarget->isPICStyleRIPRel() &&
7231       (M == CodeModel::Small || M == CodeModel::Kernel))
7232     WrapperKind = X86ISD::WrapperRIP;
7233   else if (Subtarget->isPICStyleGOT())
7234     OpFlag = X86II::MO_GOTOFF;
7235   else if (Subtarget->isPICStyleStubPIC())
7236     OpFlag = X86II::MO_PIC_BASE_OFFSET;
7237
7238   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
7239                                           OpFlag);
7240   DebugLoc DL = JT->getDebugLoc();
7241   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7242
7243   // With PIC, the address is actually $g + Offset.
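  // Rough sketch of the resulting i386 PIC code (the label is illustrative):
  //   leal .LJTI0_0@GOTOFF(%ebx), %eax   # %ebx holds the PIC base ($g)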
7244   if (OpFlag)
7245     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7246                          DAG.getNode(X86ISD::GlobalBaseReg,
7247                                      DebugLoc(), getPointerTy()),
7248                          Result);
7249
7250   return Result;
7251 }
7252
7253 SDValue
7254 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
7255   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
7256
7257   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7258   // global base reg.
7259   unsigned char OpFlag = 0;
7260   unsigned WrapperKind = X86ISD::Wrapper;
7261   CodeModel::Model M = getTargetMachine().getCodeModel();
7262
7263   if (Subtarget->isPICStyleRIPRel() &&
7264       (M == CodeModel::Small || M == CodeModel::Kernel)) {
7265     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
7266       OpFlag = X86II::MO_GOTPCREL;
7267     WrapperKind = X86ISD::WrapperRIP;
7268   } else if (Subtarget->isPICStyleGOT()) {
7269     OpFlag = X86II::MO_GOT;
7270   } else if (Subtarget->isPICStyleStubPIC()) {
7271     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
7272   } else if (Subtarget->isPICStyleStubNoDynamic()) {
7273     OpFlag = X86II::MO_DARWIN_NONLAZY;
7274   }
7275
7276   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
7277
7278   DebugLoc DL = Op.getDebugLoc();
7279   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7280
7281
7282   // With PIC, the address is actually $g + Offset.
7283   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
7284       !Subtarget->is64Bit()) {
7285     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7286                          DAG.getNode(X86ISD::GlobalBaseReg,
7287                                      DebugLoc(), getPointerTy()),
7288                          Result);
7289   }
7290
7291   // For symbols that require a load from a stub to get the address, emit the
7292   // load.
7293   if (isGlobalStubReference(OpFlag))
7294     Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
7295                          MachinePointerInfo::getGOT(), false, false, false, 0);
7296
7297   return Result;
7298 }
7299
7300 SDValue
7301 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
7302   // Create the TargetBlockAddress node.
7303   unsigned char OpFlags =
7304     Subtarget->ClassifyBlockAddressReference();
7305   CodeModel::Model M = getTargetMachine().getCodeModel();
7306   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
7307   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
7308   DebugLoc dl = Op.getDebugLoc();
7309   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
7310                                              OpFlags);
7311
7312   if (Subtarget->isPICStyleRIPRel() &&
7313       (M == CodeModel::Small || M == CodeModel::Kernel))
7314     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
7315   else
7316     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
7317
7318   // With PIC, the address is actually $g + Offset.
7319   if (isGlobalRelativeToPICBase(OpFlags)) {
7320     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
7321                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
7322                          Result);
7323   }
7324
7325   return Result;
7326 }
7327
7328 SDValue
7329 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
7330                                       int64_t Offset,
7331                                       SelectionDAG &DAG) const {
7332   // Create the TargetGlobalAddress node, folding in the constant
7333   // offset if it is legal.
7334 unsigned char OpFlags = 7335 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 7336 CodeModel::Model M = getTargetMachine().getCodeModel(); 7337 SDValue Result; 7338 if (OpFlags == X86II::MO_NO_FLAG && 7339 X86::isOffsetSuitableForCodeModel(Offset, M)) { 7340 // A direct static reference to a global. 7341 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 7342 Offset = 0; 7343 } else { 7344 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 7345 } 7346 7347 if (Subtarget->isPICStyleRIPRel() && 7348 (M == CodeModel::Small || M == CodeModel::Kernel)) 7349 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 7350 else 7351 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 7352 7353 // With PIC, the address is actually $g + Offset. 7354 if (isGlobalRelativeToPICBase(OpFlags)) { 7355 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 7356 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 7357 Result); 7358 } 7359 7360 // For globals that require a load from a stub to get the address, emit the 7361 // load. 7362 if (isGlobalStubReference(OpFlags)) 7363 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 7364 MachinePointerInfo::getGOT(), false, false, false, 0); 7365 7366 // If there was a non-zero offset that we didn't fold, create an explicit 7367 // addition for it. 7368 if (Offset != 0) 7369 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 7370 DAG.getConstant(Offset, getPointerTy())); 7371 7372 return Result; 7373} 7374 7375SDValue 7376X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 7377 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 7378 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 7379 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 7380} 7381 7382static SDValue 7383GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 7384 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 7385 unsigned char OperandFlags, bool LocalDynamic = false) { 7386 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7387 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 7388 DebugLoc dl = GA->getDebugLoc(); 7389 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7390 GA->getValueType(0), 7391 GA->getOffset(), 7392 OperandFlags); 7393 7394 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR 7395 : X86ISD::TLSADDR; 7396 7397 if (InFlag) { 7398 SDValue Ops[] = { Chain, TGA, *InFlag }; 7399 Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3); 7400 } else { 7401 SDValue Ops[] = { Chain, TGA }; 7402 Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2); 7403 } 7404 7405 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 7406 MFI->setAdjustsStack(true); 7407 7408 SDValue Flag = Chain.getValue(1); 7409 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 7410} 7411 7412// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 7413static SDValue 7414LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7415 const EVT PtrVT) { 7416 SDValue InFlag; 7417 DebugLoc dl = GA->getDebugLoc(); // ? 
function entry point might be better 7418 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 7419 DAG.getNode(X86ISD::GlobalBaseReg, 7420 DebugLoc(), PtrVT), InFlag); 7421 InFlag = Chain.getValue(1); 7422 7423 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 7424} 7425 7426// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 7427static SDValue 7428LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7429 const EVT PtrVT) { 7430 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 7431 X86::RAX, X86II::MO_TLSGD); 7432} 7433 7434static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, 7435 SelectionDAG &DAG, 7436 const EVT PtrVT, 7437 bool is64Bit) { 7438 DebugLoc dl = GA->getDebugLoc(); 7439 7440 // Get the start address of the TLS block for this module. 7441 X86MachineFunctionInfo* MFI = DAG.getMachineFunction() 7442 .getInfo<X86MachineFunctionInfo>(); 7443 MFI->incNumLocalDynamicTLSAccesses(); 7444 7445 SDValue Base; 7446 if (is64Bit) { 7447 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX, 7448 X86II::MO_TLSLD, /*LocalDynamic=*/true); 7449 } else { 7450 SDValue InFlag; 7451 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 7452 DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag); 7453 InFlag = Chain.getValue(1); 7454 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, 7455 X86II::MO_TLSLDM, /*LocalDynamic=*/true); 7456 } 7457 7458 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations 7459 // of Base. 7460 7461 // Build x@dtpoff. 7462 unsigned char OperandFlags = X86II::MO_DTPOFF; 7463 unsigned WrapperKind = X86ISD::Wrapper; 7464 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7465 GA->getValueType(0), 7466 GA->getOffset(), OperandFlags); 7467 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 7468 7469 // Add x@dtpoff with the base. 7470 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); 7471} 7472 7473// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. 7474static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7475 const EVT PtrVT, TLSModel::Model model, 7476 bool is64Bit, bool isPIC) { 7477 DebugLoc dl = GA->getDebugLoc(); 7478 7479 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 7480 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 7481 is64Bit ? 257 : 256)); 7482 7483 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 7484 DAG.getIntPtrConstant(0), 7485 MachinePointerInfo(Ptr), 7486 false, false, false, 0); 7487 7488 unsigned char OperandFlags = 0; 7489 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 7490 // initialexec. 7491 unsigned WrapperKind = X86ISD::Wrapper; 7492 if (model == TLSModel::LocalExec) { 7493 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; 7494 } else if (model == TLSModel::InitialExec) { 7495 if (is64Bit) { 7496 OperandFlags = X86II::MO_GOTTPOFF; 7497 WrapperKind = X86ISD::WrapperRIP; 7498 } else { 7499 OperandFlags = isPIC ? 
X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; 7500 } 7501 } else { 7502 llvm_unreachable("Unexpected model"); 7503 } 7504 7505 // emit "addl x@ntpoff,%eax" (local exec) 7506 // or "addl x@indntpoff,%eax" (initial exec) 7507 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) 7508 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7509 GA->getValueType(0), 7510 GA->getOffset(), OperandFlags); 7511 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 7512 7513 if (model == TLSModel::InitialExec) { 7514 if (isPIC && !is64Bit) { 7515 Offset = DAG.getNode(ISD::ADD, dl, PtrVT, 7516 DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), 7517 Offset); 7518 } 7519 7520 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 7521 MachinePointerInfo::getGOT(), false, false, false, 7522 0); 7523 } 7524 7525 // The address of the thread local variable is the add of the thread 7526 // pointer with the offset of the variable. 7527 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 7528} 7529 7530SDValue 7531X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 7532 7533 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 7534 const GlobalValue *GV = GA->getGlobal(); 7535 7536 if (Subtarget->isTargetELF()) { 7537 TLSModel::Model model = getTargetMachine().getTLSModel(GV); 7538 7539 switch (model) { 7540 case TLSModel::GeneralDynamic: 7541 if (Subtarget->is64Bit()) 7542 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 7543 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 7544 case TLSModel::LocalDynamic: 7545 return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(), 7546 Subtarget->is64Bit()); 7547 case TLSModel::InitialExec: 7548 case TLSModel::LocalExec: 7549 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 7550 Subtarget->is64Bit(), 7551 getTargetMachine().getRelocationModel() == Reloc::PIC_); 7552 } 7553 llvm_unreachable("Unknown TLS model."); 7554 } 7555 7556 if (Subtarget->isTargetDarwin()) { 7557 // Darwin only has one model of TLS. Lower to that. 7558 unsigned char OpFlag = 0; 7559 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 7560 X86ISD::WrapperRIP : X86ISD::Wrapper; 7561 7562 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7563 // global base reg. 7564 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 7565 !Subtarget->is64Bit(); 7566 if (PIC32) 7567 OpFlag = X86II::MO_TLVP_PIC_BASE; 7568 else 7569 OpFlag = X86II::MO_TLVP; 7570 DebugLoc DL = Op.getDebugLoc(); 7571 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 7572 GA->getValueType(0), 7573 GA->getOffset(), OpFlag); 7574 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7575 7576 // With PIC32, the address is actually $g + Offset. 7577 if (PIC32) 7578 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7579 DAG.getNode(X86ISD::GlobalBaseReg, 7580 DebugLoc(), getPointerTy()), 7581 Offset); 7582 7583 // Lowering the machine isd will make sure everything is in the right 7584 // location. 7585 SDValue Chain = DAG.getEntryNode(); 7586 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 7587 SDValue Args[] = { Chain, Offset }; 7588 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); 7589 7590 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 
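    // A sketch of the x86-64 sequence this lowers to (exact registers are
    // dictated by the TLV calling convention):
    //   movq _x@TLVP(%rip), %rdi
    //   callq *(%rdi)          # TLS address comes back in %rax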
    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
    MFI->setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
                              Chain.getValue(1));
  }

  if (Subtarget->isTargetWindows()) {
    // Just use the implicit TLS architecture
    // Need to generate something similar to:
    //   mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    //                                ; from TEB
    //   mov ecx, dword [rel _tls_index]; Load index (from C runtime)
    //   mov rcx, qword [rdx+rcx*8]
    //   mov eax, .tls$:tlsvar
    //   [rax+rcx] contains the address
    // Windows 64bit: gs:0x58
    // Windows 32bit: fs:__tls_array

    // If GV is an alias then use the aliasee for determining
    // thread-localness.
    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
      GV = GA->resolveAliasedGlobal(false);
    DebugLoc dl = GA->getDebugLoc();
    SDValue Chain = DAG.getEntryNode();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit).
    Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
                                        ? Type::getInt8PtrTy(*DAG.getContext(),
                                                             256)
                                        : Type::getInt32PtrTy(*DAG.getContext(),
                                                              257));

    SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain,
                                        Subtarget->is64Bit()
                                        ? DAG.getIntPtrConstant(0x58)
                                        : DAG.getExternalSymbol("_tls_array",
                                                                getPointerTy()),
                                        MachinePointerInfo(Ptr),
                                        false, false, false, 0);

    // Load the _tls_index variable
    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
    if (Subtarget->is64Bit())
      IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
                           IDX, MachinePointerInfo(), MVT::i32,
                           false, false, 0);
    else
      IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
                        false, false, false, 0);

    SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
                                    getPointerTy());
    IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);

    SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
                      false, false, false, 0);

    // Get the offset of the start of the .tls section
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
                                             GA->getOffset(), X86II::MO_SECREL);
    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);

    // The address of the thread local variable is the sum of the thread
    // pointer and the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}


/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
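///
/// Illustrative pseudo-assembly for the SHL_PARTS case (a sketch of the
/// nodes built below, not a literal emission; register names are
/// placeholders):
///   shld  hi, lo, cl        ; hi = (hi << amt) | (lo >> (bits - amt))
///   shl   lo, cl            ; lo <<= amt
///   test  cl, bits          ; did the amount reach the next word?
///   cmovne hi, lo           ; if so, the shifted low word is the high result
///   cmovne lo, zero         ; ...and the low result is zero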
7671SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ 7672 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 7673 EVT VT = Op.getValueType(); 7674 unsigned VTBits = VT.getSizeInBits(); 7675 DebugLoc dl = Op.getDebugLoc(); 7676 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 7677 SDValue ShOpLo = Op.getOperand(0); 7678 SDValue ShOpHi = Op.getOperand(1); 7679 SDValue ShAmt = Op.getOperand(2); 7680 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 7681 DAG.getConstant(VTBits - 1, MVT::i8)) 7682 : DAG.getConstant(0, VT); 7683 7684 SDValue Tmp2, Tmp3; 7685 if (Op.getOpcode() == ISD::SHL_PARTS) { 7686 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 7687 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 7688 } else { 7689 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 7690 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 7691 } 7692 7693 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 7694 DAG.getConstant(VTBits, MVT::i8)); 7695 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 7696 AndNode, DAG.getConstant(0, MVT::i8)); 7697 7698 SDValue Hi, Lo; 7699 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7700 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 7701 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 7702 7703 if (Op.getOpcode() == ISD::SHL_PARTS) { 7704 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 7705 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 7706 } else { 7707 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 7708 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 7709 } 7710 7711 SDValue Ops[2] = { Lo, Hi }; 7712 return DAG.getMergeValues(Ops, 2, dl); 7713} 7714 7715SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 7716 SelectionDAG &DAG) const { 7717 EVT SrcVT = Op.getOperand(0).getValueType(); 7718 7719 if (SrcVT.isVector()) 7720 return SDValue(); 7721 7722 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 7723 "Unknown SINT_TO_FP to lower!"); 7724 7725 // These are really Legal; return the operand so the caller accepts it as 7726 // Legal. 
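  // (Illustrative note: with SSE these map directly onto cvtsi2ss/cvtsi2sd,
  // e.g. i32 -> f64 is just "cvtsi2sdl %eax, %xmm0", and x86-64 adds the
  // 64-bit form "cvtsi2sdq %rax, %xmm0", so no expansion is needed.)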
7727 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 7728 return Op; 7729 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 7730 Subtarget->is64Bit()) { 7731 return Op; 7732 } 7733 7734 DebugLoc dl = Op.getDebugLoc(); 7735 unsigned Size = SrcVT.getSizeInBits()/8; 7736 MachineFunction &MF = DAG.getMachineFunction(); 7737 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 7738 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7739 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7740 StackSlot, 7741 MachinePointerInfo::getFixedStack(SSFI), 7742 false, false, 0); 7743 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 7744} 7745 7746SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 7747 SDValue StackSlot, 7748 SelectionDAG &DAG) const { 7749 // Build the FILD 7750 DebugLoc DL = Op.getDebugLoc(); 7751 SDVTList Tys; 7752 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 7753 if (useSSE) 7754 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 7755 else 7756 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 7757 7758 unsigned ByteSize = SrcVT.getSizeInBits()/8; 7759 7760 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); 7761 MachineMemOperand *MMO; 7762 if (FI) { 7763 int SSFI = FI->getIndex(); 7764 MMO = 7765 DAG.getMachineFunction() 7766 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7767 MachineMemOperand::MOLoad, ByteSize, ByteSize); 7768 } else { 7769 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); 7770 StackSlot = StackSlot.getOperand(1); 7771 } 7772 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 7773 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 7774 X86ISD::FILD, DL, 7775 Tys, Ops, array_lengthof(Ops), 7776 SrcVT, MMO); 7777 7778 if (useSSE) { 7779 Chain = Result.getValue(1); 7780 SDValue InFlag = Result.getValue(2); 7781 7782 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 7783 // shouldn't be necessary except that RFP cannot be live across 7784 // multiple blocks. When stackifier is fixed, they can be uncoupled. 7785 MachineFunction &MF = DAG.getMachineFunction(); 7786 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 7787 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 7788 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7789 Tys = DAG.getVTList(MVT::Other); 7790 SDValue Ops[] = { 7791 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 7792 }; 7793 MachineMemOperand *MMO = 7794 DAG.getMachineFunction() 7795 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7796 MachineMemOperand::MOStore, SSFISize, SSFISize); 7797 7798 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 7799 Ops, array_lengthof(Ops), 7800 Op.getValueType(), MMO); 7801 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 7802 MachinePointerInfo::getFixedStack(SSFI), 7803 false, false, false, 0); 7804 } 7805 7806 return Result; 7807} 7808 7809// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 7810SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 7811 SelectionDAG &DAG) const { 7812 // This algorithm is not obvious. 
  // Here is what we're trying to output:
  /*
     movq %rax, %xmm0
     punpckldq (c0), %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
     subpd (c1), %xmm0      // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
     #ifdef __SSE3__
       haddpd %xmm0, %xmm0
     #else
       pshufd $0x4e, %xmm0, %xmm1
       addpd %xmm1, %xmm0
     #endif
  */

  DebugLoc dl = Op.getDebugLoc();
  LLVMContext *Context = DAG.getContext();

  // Build some magic constants.
  const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
  Constant *C0 = ConstantDataVector::get(*Context, CV0);
  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);

  SmallVector<Constant*,2> CV1;
  CV1.push_back(
        ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
  CV1.push_back(
        ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
  Constant *C1 = ConstantVector::get(CV1);
  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);

  // Load the 64-bit value into an XMM register.
  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                            Op.getOperand(0));
  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, 16);
  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
                              CLod0);

  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, 16);
  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
  SDValue Result;

  if (Subtarget->hasSSE3()) {
    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
  } else {
    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
                                           S2F, 0x4E, DAG);
    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
                         Sub);
  }

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
                     DAG.getIntPtrConstant(0));
}

// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
                                               SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();
  // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
                                   MVT::f64);

  // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             Op.getOperand(0));

  // Zero out the upper parts of the register.
  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);

  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
                     DAG.getIntPtrConstant(0));

  // Or the load with the bias.
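  // Worked example of the bias trick (reader's note, not original code):
  // 0x4330000000000000 is the double 2^52. OR-ing a 32-bit value x into the
  // low mantissa bits produces exactly the double 2^52 + x, since x < 2^32
  // fits inside the 52-bit mantissa. The subtraction below then recovers x
  // with no rounding, e.g. x = 3: (2^52 + 3) - 2^52 = 3.0.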
  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Load)),
                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Bias)));
  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
                   DAG.getIntPtrConstant(0));

  // Subtract the bias.
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  EVT DestVT = Op.getValueType();

  if (DestVT.bitsLT(MVT::f64))
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0));
  if (DestVT.bitsGT(MVT::f64))
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);

  // No rounding needed; the result is already f64.
  return Sub;
}

SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);

  EVT SrcVT = N0.getValueType();
  EVT DstVT = Op.getValueType();
  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i64(Op, DAG);
  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i32(Op, DAG);
  if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
    return SDValue();

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
  if (SrcVT == MVT::i32) {
    SDValue WordOff = DAG.getConstant(4, getPointerTy());
    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
                                     getPointerTy(), StackSlot, WordOff);
    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                  StackSlot, MachinePointerInfo(),
                                  false, false, 0);
    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
                                  OffsetSlot, MachinePointerInfo(),
                                  false, false, 0);
    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
    return Fild;
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                               StackSlot, MachinePointerInfo(),
                               false, false, 0);
  // For i64 source, we need to add the appropriate power of 2 if the input
  // was negative. This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
  // we must be careful to do the computation in x87 extended precision, not
  // in SSE. (The generic code can't know it's OK to do this, or how to.)
  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
  MachineMemOperand *MMO =
    DAG.getMachineFunction()
    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                          MachineMemOperand::MOLoad, 8, 8);

  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3,
                                         MVT::i64, MMO);

  APInt FF(32, 0x5F800000ULL);

  // Check whether the sign bit is set.
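  // (Reader's note, illustrative: FILD converted the i64 as a *signed*
  // value, so an input with the sign bit set came out as x - 2^64.
  // 0x5F800000 is the f32 bit pattern of 2^64; the code below picks either
  // that fudge factor or 0.0 and adds it back in x87 extended precision.)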
7980 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 7981 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 7982 ISD::SETLT); 7983 7984 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 7985 SDValue FudgePtr = DAG.getConstantPool( 7986 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 7987 getPointerTy()); 7988 7989 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 7990 SDValue Zero = DAG.getIntPtrConstant(0); 7991 SDValue Four = DAG.getIntPtrConstant(4); 7992 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 7993 Zero, Four); 7994 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 7995 7996 // Load the value out, extending it from f32 to f80. 7997 // FIXME: Avoid the extend by constructing the right constant pool? 7998 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), 7999 FudgePtr, MachinePointerInfo::getConstantPool(), 8000 MVT::f32, false, false, 4); 8001 // Extend everything to 80 bits to force it to be done on x87. 8002 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 8003 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 8004} 8005 8006std::pair<SDValue,SDValue> X86TargetLowering:: 8007FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const { 8008 DebugLoc DL = Op.getDebugLoc(); 8009 8010 EVT DstTy = Op.getValueType(); 8011 8012 if (!IsSigned && !isIntegerTypeFTOL(DstTy)) { 8013 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 8014 DstTy = MVT::i64; 8015 } 8016 8017 assert(DstTy.getSimpleVT() <= MVT::i64 && 8018 DstTy.getSimpleVT() >= MVT::i16 && 8019 "Unknown FP_TO_INT to lower!"); 8020 8021 // These are really Legal. 8022 if (DstTy == MVT::i32 && 8023 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 8024 return std::make_pair(SDValue(), SDValue()); 8025 if (Subtarget->is64Bit() && 8026 DstTy == MVT::i64 && 8027 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 8028 return std::make_pair(SDValue(), SDValue()); 8029 8030 // We lower FP->int64 either into FISTP64 followed by a load from a temporary 8031 // stack slot, or into the FTOL runtime function. 8032 MachineFunction &MF = DAG.getMachineFunction(); 8033 unsigned MemSize = DstTy.getSizeInBits()/8; 8034 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 8035 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 8036 8037 unsigned Opc; 8038 if (!IsSigned && isIntegerTypeFTOL(DstTy)) 8039 Opc = X86ISD::WIN_FTOL; 8040 else 8041 switch (DstTy.getSimpleVT().SimpleTy) { 8042 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 8043 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 8044 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 8045 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 8046 } 8047 8048 SDValue Chain = DAG.getEntryNode(); 8049 SDValue Value = Op.getOperand(0); 8050 EVT TheVT = Op.getOperand(0).getValueType(); 8051 // FIXME This causes a redundant load/store if the SSE-class value is already 8052 // in memory, such as if it is on the callstack. 
8053 if (isScalarFPTypeInSSEReg(TheVT)) { 8054 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 8055 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 8056 MachinePointerInfo::getFixedStack(SSFI), 8057 false, false, 0); 8058 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 8059 SDValue Ops[] = { 8060 Chain, StackSlot, DAG.getValueType(TheVT) 8061 }; 8062 8063 MachineMemOperand *MMO = 8064 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8065 MachineMemOperand::MOLoad, MemSize, MemSize); 8066 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, 8067 DstTy, MMO); 8068 Chain = Value.getValue(1); 8069 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 8070 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 8071 } 8072 8073 MachineMemOperand *MMO = 8074 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8075 MachineMemOperand::MOStore, MemSize, MemSize); 8076 8077 if (Opc != X86ISD::WIN_FTOL) { 8078 // Build the FP_TO_INT*_IN_MEM 8079 SDValue Ops[] = { Chain, Value, StackSlot }; 8080 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 8081 Ops, 3, DstTy, MMO); 8082 return std::make_pair(FIST, StackSlot); 8083 } else { 8084 SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, 8085 DAG.getVTList(MVT::Other, MVT::Glue), 8086 Chain, Value); 8087 SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX, 8088 MVT::i32, ftol.getValue(1)); 8089 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX, 8090 MVT::i32, eax.getValue(2)); 8091 SDValue Ops[] = { eax, edx }; 8092 SDValue pair = IsReplace 8093 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2) 8094 : DAG.getMergeValues(Ops, 2, DL); 8095 return std::make_pair(pair, SDValue()); 8096 } 8097} 8098 8099SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 8100 SelectionDAG &DAG) const { 8101 if (Op.getValueType().isVector()) 8102 return SDValue(); 8103 8104 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 8105 /*IsSigned=*/ true, /*IsReplace=*/ false); 8106 SDValue FIST = Vals.first, StackSlot = Vals.second; 8107 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 8108 if (FIST.getNode() == 0) return Op; 8109 8110 if (StackSlot.getNode()) 8111 // Load the result. 8112 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 8113 FIST, StackSlot, MachinePointerInfo(), 8114 false, false, false, 0); 8115 8116 // The node is the result. 8117 return FIST; 8118} 8119 8120SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 8121 SelectionDAG &DAG) const { 8122 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 8123 /*IsSigned=*/ false, /*IsReplace=*/ false); 8124 SDValue FIST = Vals.first, StackSlot = Vals.second; 8125 assert(FIST.getNode() && "Unexpected failure"); 8126 8127 if (StackSlot.getNode()) 8128 // Load the result. 8129 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 8130 FIST, StackSlot, MachinePointerInfo(), 8131 false, false, false, 0); 8132 8133 // The node is the result. 
8134 return FIST; 8135} 8136 8137SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op, 8138 SelectionDAG &DAG) const { 8139 DebugLoc DL = Op.getDebugLoc(); 8140 EVT VT = Op.getValueType(); 8141 SDValue In = Op.getOperand(0); 8142 EVT SVT = In.getValueType(); 8143 8144 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); 8145 8146 return DAG.getNode(X86ISD::VFPEXT, DL, VT, 8147 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, 8148 In, DAG.getUNDEF(SVT))); 8149} 8150 8151SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { 8152 LLVMContext *Context = DAG.getContext(); 8153 DebugLoc dl = Op.getDebugLoc(); 8154 EVT VT = Op.getValueType(); 8155 EVT EltVT = VT; 8156 unsigned NumElts = VT == MVT::f64 ? 2 : 4; 8157 if (VT.isVector()) { 8158 EltVT = VT.getVectorElementType(); 8159 NumElts = VT.getVectorNumElements(); 8160 } 8161 Constant *C; 8162 if (EltVT == MVT::f64) 8163 C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 8164 else 8165 C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 8166 C = ConstantVector::getSplat(NumElts, C); 8167 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); 8168 unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); 8169 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8170 MachinePointerInfo::getConstantPool(), 8171 false, false, false, Alignment); 8172 if (VT.isVector()) { 8173 MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; 8174 return DAG.getNode(ISD::BITCAST, dl, VT, 8175 DAG.getNode(ISD::AND, dl, ANDVT, 8176 DAG.getNode(ISD::BITCAST, dl, ANDVT, 8177 Op.getOperand(0)), 8178 DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask))); 8179 } 8180 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 8181} 8182 8183SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 8184 LLVMContext *Context = DAG.getContext(); 8185 DebugLoc dl = Op.getDebugLoc(); 8186 EVT VT = Op.getValueType(); 8187 EVT EltVT = VT; 8188 unsigned NumElts = VT == MVT::f64 ? 2 : 4; 8189 if (VT.isVector()) { 8190 EltVT = VT.getVectorElementType(); 8191 NumElts = VT.getVectorNumElements(); 8192 } 8193 Constant *C; 8194 if (EltVT == MVT::f64) 8195 C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 8196 else 8197 C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 8198 C = ConstantVector::getSplat(NumElts, C); 8199 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); 8200 unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); 8201 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8202 MachinePointerInfo::getConstantPool(), 8203 false, false, false, Alignment); 8204 if (VT.isVector()) { 8205 MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; 8206 return DAG.getNode(ISD::BITCAST, dl, VT, 8207 DAG.getNode(ISD::XOR, dl, XORVT, 8208 DAG.getNode(ISD::BITCAST, dl, XORVT, 8209 Op.getOperand(0)), 8210 DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); 8211 } 8212 8213 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 8214} 8215 8216SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 8217 LLVMContext *Context = DAG.getContext(); 8218 SDValue Op0 = Op.getOperand(0); 8219 SDValue Op1 = Op.getOperand(1); 8220 DebugLoc dl = Op.getDebugLoc(); 8221 EVT VT = Op.getValueType(); 8222 EVT SrcVT = Op1.getValueType(); 8223 8224 // If second operand is smaller, extend it first. 
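  // e.g. (illustrative) FCOPYSIGN f64 %X, f32 %Y: %Y is extended to f64
  // here so the sign-bit masks built below can be applied in one common
  // type, computing (X & ~(1 << 63)) | (Y & (1 << 63)).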
8225 if (SrcVT.bitsLT(VT)) { 8226 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 8227 SrcVT = VT; 8228 } 8229 // And if it is bigger, shrink it first. 8230 if (SrcVT.bitsGT(VT)) { 8231 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 8232 SrcVT = VT; 8233 } 8234 8235 // At this point the operands and the result should have the same 8236 // type, and that won't be f80 since that is not custom lowered. 8237 8238 // First get the sign bit of second operand. 8239 SmallVector<Constant*,4> CV; 8240 if (SrcVT == MVT::f64) { 8241 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 8242 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 8243 } else { 8244 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 8245 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8246 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8247 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8248 } 8249 Constant *C = ConstantVector::get(CV); 8250 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8251 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 8252 MachinePointerInfo::getConstantPool(), 8253 false, false, false, 16); 8254 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 8255 8256 // Shift sign bit right or left if the two operands have different types. 8257 if (SrcVT.bitsGT(VT)) { 8258 // Op0 is MVT::f32, Op1 is MVT::f64. 8259 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 8260 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 8261 DAG.getConstant(32, MVT::i32)); 8262 SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); 8263 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 8264 DAG.getIntPtrConstant(0)); 8265 } 8266 8267 // Clear first operand sign bit. 8268 CV.clear(); 8269 if (VT == MVT::f64) { 8270 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 8271 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 8272 } else { 8273 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 8274 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8275 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8276 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8277 } 8278 C = ConstantVector::get(CV); 8279 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8280 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8281 MachinePointerInfo::getConstantPool(), 8282 false, false, false, 16); 8283 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 8284 8285 // Or the value with the sign bit. 8286 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 8287} 8288 8289static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { 8290 SDValue N0 = Op.getOperand(0); 8291 DebugLoc dl = Op.getDebugLoc(); 8292 EVT VT = Op.getValueType(); 8293 8294 // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). 8295 SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, 8296 DAG.getConstant(1, VT)); 8297 return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); 8298} 8299 8300// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able. 
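// A sketch of the pattern this recognizes (illustrative pseudo-IR, not from
// the original source):
//   %e0 = extractelement <4 x i32> %v, 0
//   ...
//   %e3 = extractelement <4 x i32> %v, 3
//   %or = or (or %e0, %e1), (or %e2, %e3)
//   %c  = seteq %or, 0
// Every lane of %v is OR'd together and tested for zero, so the whole tree
// can become a single "ptest %v, %v" whose ZF answers the question.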
//
SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

  if (!Subtarget->hasSSE41())
    return SDValue();

  if (!Op->hasOneUse())
    return SDValue();

  SDNode *N = Op.getNode();
  DebugLoc DL = N->getDebugLoc();

  SmallVector<SDValue, 8> Opnds;
  DenseMap<SDValue, unsigned> VecInMap;
  EVT VT = MVT::Other;

  // Recognize a special case where a vector is cast into a wide integer to
  // test all 0s.
  Opnds.push_back(N->getOperand(0));
  Opnds.push_back(N->getOperand(1));

  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
    SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
    // BFS traverse all OR'd operands.
    if (I->getOpcode() == ISD::OR) {
      Opnds.push_back(I->getOperand(0));
      Opnds.push_back(I->getOperand(1));
      // Re-evaluate the number of nodes to be traversed.
      e += 2; // 2 more nodes (LHS and RHS) are pushed.
      continue;
    }

    // Quit if the operand isn't an EXTRACT_VECTOR_ELT.
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // Quit if the index isn't a constant.
    SDValue Idx = I->getOperand(1);
    if (!isa<ConstantSDNode>(Idx))
      return SDValue();

    SDValue ExtractedFromVec = I->getOperand(0);
    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
    if (M == VecInMap.end()) {
      VT = ExtractedFromVec.getValueType();
      // Quit if not 128/256-bit vector.
      if (!VT.is128BitVector() && !VT.is256BitVector())
        return SDValue();
      // Quit if not the same type.
      if (VecInMap.begin() != VecInMap.end() &&
          VT != VecInMap.begin()->first.getValueType())
        return SDValue();
      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
    }
    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
  }

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Not extracted from 128-/256-bit vector.");

  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
  SmallVector<SDValue, 8> VecIns;

  for (DenseMap<SDValue, unsigned>::const_iterator
        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
    // Quit if not all elements are used.
    if (I->second != FullMask)
      return SDValue();
    VecIns.push_back(I->first);
  }

  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

  // Cast all vectors into TestVT for PTEST.
  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);

  // If more than one full vector is evaluated, OR them first before PTEST.
  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
    // Each iteration will OR 2 nodes and append the result until there is only
    // 1 node left, i.e. the final OR'd value of all vectors.
    SDValue LHS = VecIns[Slot];
    SDValue RHS = VecIns[Slot + 1];
    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
  }

  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
                     VecIns.back(), VecIns.back());
}

/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
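/// For example (illustrative): in "if ((a + b) != 0)" the ADD already sets
/// ZF, so rather than emitting
///   addl %esi, %edi
///   testl %edi, %edi
/// this routine returns the flag result of an EFLAGS-producing X86ISD::ADD
/// and the separate TEST disappears.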
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
                                    SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();

  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO:
    NeedOF = true;
    break;
  }

  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF)
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));

  unsigned Opcode = 0;
  unsigned NumOperands = 0;

  // Truncate operations may prevent the merge of the SETCC instruction
  // and the arithmetic instruction before it. Attempt to truncate the operands
  // of the arithmetic instruction and use a reduced bit-width instruction.
  bool NeedTruncation = false;
  SDValue ArithOp = Op;
  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
    SDValue Arith = Op->getOperand(0);
    // Both the trunc and the arithmetic op need to have one user each.
    if (Arith->hasOneUse())
      switch (Arith.getOpcode()) {
        default: break;
        case ISD::ADD:
        case ISD::SUB:
        case ISD::AND:
        case ISD::OR:
        case ISD::XOR: {
          NeedTruncation = true;
          ArithOp = Arith;
        }
      }
  }

  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
  // which may be the result of a CAST.  We use the variable 'Op', which is the
  // non-casted variable when we check for possible users.
  switch (ArithOp.getOpcode()) {
  case ISD::ADD:
    // Due to an isel shortcoming, be conservative if this add is likely to be
    // selected as part of a load-modify-store instruction. When the root node
    // in a match is a store, isel doesn't know how to remap non-chain non-flag
    // uses of other nodes in the match, such as the ADD in this case. This
    // leads to the ADD being left around and reselected, with the result being
    // two adds in the output.  Alas, even if none of our users are stores,
    // that doesn't prove we're O.K.  Ergo, if we have any parents that aren't
    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
    // climbing the DAG back to the root, and it doesn't seem to be worth the
    // effort.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg &&
          UI->getOpcode() != ISD::SETCC &&
          UI->getOpcode() != ISD::STORE)
        goto default_case;

    if (ConstantSDNode *C =
        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
      // An add of one will be selected as an INC.
      if (C->getAPIntValue() == 1) {
        Opcode = X86ISD::INC;
        NumOperands = 1;
        break;
      }

      // An add of negative one (subtract of one) will be selected as a DEC.
      if (C->getAPIntValue().isAllOnesValue()) {
        Opcode = X86ISD::DEC;
        NumOperands = 1;
        break;
      }
    }

    // Otherwise use a regular EFLAGS-setting add.
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;
  case ISD::AND: {
    // If the primary result of the 'and' isn't used, don't bother using
    // X86ISD::AND, because a TEST instruction will be better.
    bool NonFlagUse = false;
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
      SDNode *User = *UI;
      unsigned UOpNo = UI.getOperandNo();
      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
        // Look past the truncate.
        UOpNo = User->use_begin().getOperandNo();
        User = *User->use_begin();
      }

      if (User->getOpcode() != ISD::BRCOND &&
          User->getOpcode() != ISD::SETCC &&
          !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
        NonFlagUse = true;
        break;
      }
    }

    if (!NonFlagUse)
      break;
  }
    // FALL THROUGH
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    // Due to the ISEL shortcoming noted above, be conservative if this op is
    // likely to be selected as part of a load-modify-store instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() == ISD::STORE)
        goto default_case;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (ArithOp.getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    case ISD::OR: {
      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
        SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG);
        if (EFLAGS.getNode())
          return EFLAGS;
      }
      Opcode = X86ISD::OR;
      break;
    }
    }

    NumOperands = 2;
    break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    return SDValue(Op.getNode(), 1);
  default:
  default_case:
    break;
  }

  // If we found that truncation is beneficial, perform the truncation and
  // update 'Op'.
  if (NeedTruncation) {
    EVT VT = Op.getValueType();
    SDValue WideVal = Op->getOperand(0);
    EVT WideVT = WideVal.getValueType();
    unsigned ConvertedOp = 0;
    // Use a target machine opcode to prevent further DAGCombine
    // optimizations that may separate the arithmetic operations
    // from the setcc node.
    switch (WideVal.getOpcode()) {
    default: break;
    case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
    case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
    case ISD::AND: ConvertedOp = X86ISD::AND; break;
    case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
    case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
    }

    if (ConvertedOp) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
      }
    }
  }

  if (Opcode == 0)
    // Emit a CMP with 0, which is the TEST pattern.
8590 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 8591 DAG.getConstant(0, Op.getValueType())); 8592 8593 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 8594 SmallVector<SDValue, 4> Ops; 8595 for (unsigned i = 0; i != NumOperands; ++i) 8596 Ops.push_back(Op.getOperand(i)); 8597 8598 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 8599 DAG.ReplaceAllUsesWith(Op, New); 8600 return SDValue(New.getNode(), 1); 8601} 8602 8603/// Emit nodes that will be selected as "cmp Op0,Op1", or something 8604/// equivalent. 8605SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 8606 SelectionDAG &DAG) const { 8607 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 8608 if (C->getAPIntValue() == 0) 8609 return EmitTest(Op0, X86CC, DAG); 8610 8611 DebugLoc dl = Op0.getDebugLoc(); 8612 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || 8613 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { 8614 // Use SUB instead of CMP to enable CSE between SUB and CMP. 8615 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); 8616 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, 8617 Op0, Op1); 8618 return SDValue(Sub.getNode(), 1); 8619 } 8620 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 8621} 8622 8623/// Convert a comparison if required by the subtarget. 8624SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, 8625 SelectionDAG &DAG) const { 8626 // If the subtarget does not support the FUCOMI instruction, floating-point 8627 // comparisons have to be converted. 8628 if (Subtarget->hasCMov() || 8629 Cmp.getOpcode() != X86ISD::CMP || 8630 !Cmp.getOperand(0).getValueType().isFloatingPoint() || 8631 !Cmp.getOperand(1).getValueType().isFloatingPoint()) 8632 return Cmp; 8633 8634 // The instruction selector will select an FUCOM instruction instead of 8635 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence 8636 // build an SDNode sequence that transfers the result from FPSW into EFLAGS: 8637 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) 8638 DebugLoc dl = Cmp.getDebugLoc(); 8639 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); 8640 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); 8641 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, 8642 DAG.getConstant(8, MVT::i8)); 8643 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); 8644 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); 8645} 8646 8647/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 8648/// if it's possible. 8649SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 8650 DebugLoc dl, SelectionDAG &DAG) const { 8651 SDValue Op0 = And.getOperand(0); 8652 SDValue Op1 = And.getOperand(1); 8653 if (Op0.getOpcode() == ISD::TRUNCATE) 8654 Op0 = Op0.getOperand(0); 8655 if (Op1.getOpcode() == ISD::TRUNCATE) 8656 Op1 = Op1.getOperand(0); 8657 8658 SDValue LHS, RHS; 8659 if (Op1.getOpcode() == ISD::SHL) 8660 std::swap(Op0, Op1); 8661 if (Op0.getOpcode() == ISD::SHL) { 8662 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 8663 if (And00C->getZExtValue() == 1) { 8664 // If we looked past a truncate, check that it's only truncating away 8665 // known zeros. 
8666 unsigned BitWidth = Op0.getValueSizeInBits(); 8667 unsigned AndBitWidth = And.getValueSizeInBits(); 8668 if (BitWidth > AndBitWidth) { 8669 APInt Zeros, Ones; 8670 DAG.ComputeMaskedBits(Op0, Zeros, Ones); 8671 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 8672 return SDValue(); 8673 } 8674 LHS = Op1; 8675 RHS = Op0.getOperand(1); 8676 } 8677 } else if (Op1.getOpcode() == ISD::Constant) { 8678 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 8679 uint64_t AndRHSVal = AndRHS->getZExtValue(); 8680 SDValue AndLHS = Op0; 8681 8682 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { 8683 LHS = AndLHS.getOperand(0); 8684 RHS = AndLHS.getOperand(1); 8685 } 8686 8687 // Use BT if the immediate can't be encoded in a TEST instruction. 8688 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { 8689 LHS = AndLHS; 8690 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType()); 8691 } 8692 } 8693 8694 if (LHS.getNode()) { 8695 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 8696 // instruction. Since the shift amount is in-range-or-undefined, we know 8697 // that doing a bittest on the i32 value is ok. We extend to i32 because 8698 // the encoding for the i16 version is larger than the i32 version. 8699 // Also promote i16 to i32 for performance / code size reason. 8700 if (LHS.getValueType() == MVT::i8 || 8701 LHS.getValueType() == MVT::i16) 8702 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 8703 8704 // If the operand types disagree, extend the shift amount to match. Since 8705 // BT ignores high bits (like shifts) we can use anyextend. 8706 if (LHS.getValueType() != RHS.getValueType()) 8707 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 8708 8709 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 8710 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 8711 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8712 DAG.getConstant(Cond, MVT::i8), BT); 8713 } 8714 8715 return SDValue(); 8716} 8717 8718SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 8719 8720 if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); 8721 8722 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 8723 SDValue Op0 = Op.getOperand(0); 8724 SDValue Op1 = Op.getOperand(1); 8725 DebugLoc dl = Op.getDebugLoc(); 8726 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 8727 8728 // Optimize to BT if possible. 8729 // Lower (X & (1 << N)) == 0 to BT(X, N). 8730 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 8731 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 8732 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 8733 Op1.getOpcode() == ISD::Constant && 8734 cast<ConstantSDNode>(Op1)->isNullValue() && 8735 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 8736 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 8737 if (NewSetCC.getNode()) 8738 return NewSetCC; 8739 } 8740 8741 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 8742 // these. 8743 if (Op1.getOpcode() == ISD::Constant && 8744 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 8745 cast<ConstantSDNode>(Op1)->isNullValue()) && 8746 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 8747 8748 // If the input is a setcc, then reuse the input setcc or use a new one with 8749 // the inverted condition. 
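    // e.g. (illustrative): "(setcc X) == 1" can return the existing
    // X86ISD::SETCC unchanged, while "(setcc X) == 0" becomes a SETCC on
    // the opposite condition code over the same EFLAGS value.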
    if (Op0.getOpcode() == X86ISD::SETCC) {
      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
      bool Invert = (CC == ISD::SETNE) ^
                    cast<ConstantSDNode>(Op1)->isNullValue();
      if (!Invert) return Op0;

      CCode = X86::GetOppositeBranchCondition(CCode);
      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                         DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
    }
  }

  bool isFP = Op1.getValueType().isFloatingPoint();
  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
  if (X86CC == X86::COND_INVALID)
    return SDValue();

  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                     DAG.getConstant(X86CC, MVT::i8), EFLAGS);
}

// Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
// ones, then concatenate the results back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();

  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  DebugLoc dl = Op.getDebugLoc();
  SDValue CC = Op.getOperand(2);

  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);

  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);

  // Issue the operation on the smaller types and concatenate the result back
  MVT EltVT = VT.getVectorElementType().getSimpleVT();
  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}


SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond;
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  EVT VT = Op.getValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
  DebugLoc dl = Op.getDebugLoc();

  if (isFP) {
#ifndef NDEBUG
    EVT EltVT = Op0.getValueType().getVectorElementType();
    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif

    unsigned SSECC;
    bool Swap = false;

    // SSE Condition code mapping:
    //  0 - EQ
    //  1 - LT
    //  2 - LE
    //  3 - UNORD
    //  4 - NEQ
    //  5 - NLT
    //  6 - NLE
    //  7 - ORD
    switch (SetCCOpcode) {
    default: llvm_unreachable("Unexpected SETCC condition");
    case ISD::SETOEQ:
    case ISD::SETEQ:  SSECC = 0; break;
    case ISD::SETOGT:
    case ISD::SETGT: Swap = true; // Fallthrough
    case ISD::SETLT:
    case ISD::SETOLT: SSECC = 1; break;
    case ISD::SETOGE:
    case ISD::SETGE: Swap = true; // Fallthrough
    case ISD::SETLE:
    case ISD::SETOLE: SSECC = 2; break;
    case ISD::SETUO:  SSECC = 3; break;
    case ISD::SETUNE:
    case ISD::SETNE:  SSECC = 4; break;
    case ISD::SETULE: Swap = true; // Fallthrough
    case ISD::SETUGE: SSECC = 5; break;
    case ISD::SETULT: Swap = true; //
Fallthrough 8850 case ISD::SETUGT: SSECC = 6; break; 8851 case ISD::SETO: SSECC = 7; break; 8852 case ISD::SETUEQ: 8853 case ISD::SETONE: SSECC = 8; break; 8854 } 8855 if (Swap) 8856 std::swap(Op0, Op1); 8857 8858 // In the two special cases we can't handle, emit two comparisons. 8859 if (SSECC == 8) { 8860 unsigned CC0, CC1; 8861 unsigned CombineOpc; 8862 if (SetCCOpcode == ISD::SETUEQ) { 8863 CC0 = 3; CC1 = 0; CombineOpc = ISD::OR; 8864 } else { 8865 assert(SetCCOpcode == ISD::SETONE); 8866 CC0 = 7; CC1 = 4; CombineOpc = ISD::AND; 8867 } 8868 8869 SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, 8870 DAG.getConstant(CC0, MVT::i8)); 8871 SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, 8872 DAG.getConstant(CC1, MVT::i8)); 8873 return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); 8874 } 8875 // Handle all other FP comparisons here. 8876 return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, 8877 DAG.getConstant(SSECC, MVT::i8)); 8878 } 8879 8880 // Break 256-bit integer vector compare into smaller ones. 8881 if (VT.is256BitVector() && !Subtarget->hasAVX2()) 8882 return Lower256IntVSETCC(Op, DAG); 8883 8884 // We are handling one of the integer comparisons here. Since SSE only has 8885 // GT and EQ comparisons for integer, swapping operands and multiple 8886 // operations may be required for some comparisons. 8887 unsigned Opc; 8888 bool Swap = false, Invert = false, FlipSigns = false; 8889 8890 switch (SetCCOpcode) { 8891 default: llvm_unreachable("Unexpected SETCC condition"); 8892 case ISD::SETNE: Invert = true; 8893 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; 8894 case ISD::SETLT: Swap = true; 8895 case ISD::SETGT: Opc = X86ISD::PCMPGT; break; 8896 case ISD::SETGE: Swap = true; 8897 case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break; 8898 case ISD::SETULT: Swap = true; 8899 case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break; 8900 case ISD::SETUGE: Swap = true; 8901 case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; 8902 } 8903 if (Swap) 8904 std::swap(Op0, Op1); 8905 8906 // Check that the operation in question is available (most are plain SSE2, 8907 // but PCMPGTQ and PCMPEQQ have different requirements). 8908 if (VT == MVT::v2i64) { 8909 if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) 8910 return SDValue(); 8911 if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) 8912 return SDValue(); 8913 } 8914 8915 // Since SSE has no unsigned integer comparisons, we need to flip the sign 8916 // bits of the inputs before performing those operations. 8917 if (FlipSigns) { 8918 EVT EltVT = VT.getVectorElementType(); 8919 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 8920 EltVT); 8921 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 8922 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 8923 SignBits.size()); 8924 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 8925 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 8926 } 8927 8928 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 8929 8930 // If the logical-not of the result is required, perform that now. 8931 if (Invert) 8932 Result = DAG.getNOT(dl, Result, VT); 8933 8934 return Result; 8935} 8936 8937// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
8938static bool isX86LogicalCmp(SDValue Op) { 8939 unsigned Opc = Op.getNode()->getOpcode(); 8940 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || 8941 Opc == X86ISD::SAHF) 8942 return true; 8943 if (Op.getResNo() == 1 && 8944 (Opc == X86ISD::ADD || 8945 Opc == X86ISD::SUB || 8946 Opc == X86ISD::ADC || 8947 Opc == X86ISD::SBB || 8948 Opc == X86ISD::SMUL || 8949 Opc == X86ISD::UMUL || 8950 Opc == X86ISD::INC || 8951 Opc == X86ISD::DEC || 8952 Opc == X86ISD::OR || 8953 Opc == X86ISD::XOR || 8954 Opc == X86ISD::AND)) 8955 return true; 8956 8957 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 8958 return true; 8959 8960 return false; 8961} 8962 8963static bool isZero(SDValue V) { 8964 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 8965 return C && C->isNullValue(); 8966} 8967 8968static bool isAllOnes(SDValue V) { 8969 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 8970 return C && C->isAllOnesValue(); 8971} 8972 8973static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { 8974 if (V.getOpcode() != ISD::TRUNCATE) 8975 return false; 8976 8977 SDValue VOp0 = V.getOperand(0); 8978 unsigned InBits = VOp0.getValueSizeInBits(); 8979 unsigned Bits = V.getValueSizeInBits(); 8980 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); 8981} 8982 8983SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 8984 bool addTest = true; 8985 SDValue Cond = Op.getOperand(0); 8986 SDValue Op1 = Op.getOperand(1); 8987 SDValue Op2 = Op.getOperand(2); 8988 DebugLoc DL = Op.getDebugLoc(); 8989 SDValue CC; 8990 8991 if (Cond.getOpcode() == ISD::SETCC) { 8992 SDValue NewCond = LowerSETCC(Cond, DAG); 8993 if (NewCond.getNode()) 8994 Cond = NewCond; 8995 } 8996 8997 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y 8998 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y 8999 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y 9000 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y 9001 if (Cond.getOpcode() == X86ISD::SETCC && 9002 Cond.getOperand(1).getOpcode() == X86ISD::CMP && 9003 isZero(Cond.getOperand(1).getOperand(1))) { 9004 SDValue Cmp = Cond.getOperand(1); 9005 9006 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); 9007 9008 if ((isAllOnes(Op1) || isAllOnes(Op2)) && 9009 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { 9010 SDValue Y = isAllOnes(Op2) ? Op1 : Op2; 9011 9012 SDValue CmpOp0 = Cmp.getOperand(0); 9013 // Apply further optimizations for special cases 9014 // (select (x != 0), -1, 0) -> neg & sbb 9015 // (select (x == 0), 0, -1) -> neg & sbb 9016 if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y)) 9017 if (YC->isNullValue() && 9018 (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { 9019 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); 9020 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, 9021 DAG.getConstant(0, CmpOp0.getValueType()), 9022 CmpOp0); 9023 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 9024 DAG.getConstant(X86::COND_B, MVT::i8), 9025 SDValue(Neg.getNode(), 1)); 9026 return Res; 9027 } 9028 9029 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, 9030 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 9031 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 9032 9033 SDValue Res = // Res = 0 or -1. 
      DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                  DAG.getConstant(X86::COND_B, MVT::i8), Cmp);

      if (isAllOnes(Op1) != (CondCode == X86::COND_E))
        Res = DAG.getNOT(DL, Res, Res.getValueType());

      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
      if (N2C == 0 || !N2C->isNullValue())
        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
      return Res;
    }
  }

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    if (C && C->getAPIntValue() == 1)
      Cond = Cond.getOperand(0);
  }

  // If the condition flag is set by an X86ISD::CMP, then use it as the
  // condition setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    EVT VT = Op.getValueType();

    bool IllegalFPCMov = false;
    if (VT.isFloatingPoint() && !VT.isVector() &&
        !isScalarFPTypeInSSEReg(VT))  // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      addTest = false;
    }
  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
              Cond.getOperand(0).getValueType() != MVT::i8)) {
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
    switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
    if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);

    if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, MVT::i8);
    addTest = false;
  }

  if (addTest) {
    // Look past the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
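    // e.g. (illustrative): "select ((X & (1 << N)) != 0), A, B" can become
    //   bt  X, N           ; CF = bit N of X
    //   cmovb ...          ; select on COND_B, no TEST of the AND needed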
9118 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
9119 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
9120 if (NewSetCC.getNode()) {
9121 CC = NewSetCC.getOperand(0);
9122 Cond = NewSetCC.getOperand(1);
9123 addTest = false;
9124 }
9125 }
9126 }
9127
9128 if (addTest) {
9129 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9130 Cond = EmitTest(Cond, X86::COND_NE, DAG);
9131 }
9132
9133 // a < b ? -1 : 0 -> RES = ~setcc_carry
9134 // a < b ? 0 : -1 -> RES = setcc_carry
9135 // a >= b ? -1 : 0 -> RES = setcc_carry
9136 // a >= b ? 0 : -1 -> RES = ~setcc_carry
9137 if (Cond.getOpcode() == X86ISD::SUB) {
9138 Cond = ConvertCmpIfNecessary(Cond, DAG);
9139 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
9140
9141 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
9142 (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
9143 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
9144 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
9145 if (isAllOnes(Op1) != (CondCode == X86::COND_B))
9146 return DAG.getNOT(DL, Res, Res.getValueType());
9147 return Res;
9148 }
9149 }
9150
9151 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
9152 // the condition is true.
9153 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
9154 SDValue Ops[] = { Op2, Op1, CC, Cond };
9155 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
9156 }
9157
9158 // isAndOrOfSetCCs - Return true if node is an ISD::AND or
9159 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
9160 // from the AND / OR.
9161 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
9162 Opc = Op.getOpcode();
9163 if (Opc != ISD::OR && Opc != ISD::AND)
9164 return false;
9165 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
9166 Op.getOperand(0).hasOneUse() &&
9167 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
9168 Op.getOperand(1).hasOneUse());
9169 }
9170
9171 // isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and
9172 // 1, where the SETCC node has a single use.
9173 static bool isXor1OfSetCC(SDValue Op) {
9174 if (Op.getOpcode() != ISD::XOR)
9175 return false;
9176 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
9177 if (N1C && N1C->getAPIntValue() == 1) {
9178 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
9179 Op.getOperand(0).hasOneUse();
9180 }
9181 return false;
9182 }
9183
9184 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
9185 bool addTest = true;
9186 SDValue Chain = Op.getOperand(0);
9187 SDValue Cond = Op.getOperand(1);
9188 SDValue Dest = Op.getOperand(2);
9189 DebugLoc dl = Op.getDebugLoc();
9190 SDValue CC;
9191 bool Inverted = false;
9192
9193 if (Cond.getOpcode() == ISD::SETCC) {
9194 // Check for setcc([su]{add,sub,mul}o == 0).
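// e.g. (brcond (setcc (saddo X, Y), 0, eq), dest) branches to dest exactly
// when the add does NOT overflow, so the overflow condition must be
// inverted below (see the Inverted flag).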
9195 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
9196 isa<ConstantSDNode>(Cond.getOperand(1)) &&
9197 cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
9198 Cond.getOperand(0).getResNo() == 1 &&
9199 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
9200 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
9201 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
9202 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
9203 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
9204 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
9205 Inverted = true;
9206 Cond = Cond.getOperand(0);
9207 } else {
9208 SDValue NewCond = LowerSETCC(Cond, DAG);
9209 if (NewCond.getNode())
9210 Cond = NewCond;
9211 }
9212 }
9213 #if 0
9214 // FIXME: LowerXALUO doesn't handle these!!
9215 else if (Cond.getOpcode() == X86ISD::ADD ||
9216 Cond.getOpcode() == X86ISD::SUB ||
9217 Cond.getOpcode() == X86ISD::SMUL ||
9218 Cond.getOpcode() == X86ISD::UMUL)
9219 Cond = LowerXALUO(Cond, DAG);
9220 #endif
9221
9222 // Look past (and (setcc_carry (cmp ...)), 1).
9223 if (Cond.getOpcode() == ISD::AND &&
9224 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
9225 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
9226 if (C && C->getAPIntValue() == 1)
9227 Cond = Cond.getOperand(0);
9228 }
9229
9230 // If condition flag is set by an X86ISD::CMP, then use it as the condition
9231 // setting operand in place of the X86ISD::SETCC.
9232 unsigned CondOpcode = Cond.getOpcode();
9233 if (CondOpcode == X86ISD::SETCC ||
9234 CondOpcode == X86ISD::SETCC_CARRY) {
9235 CC = Cond.getOperand(0);
9236
9237 SDValue Cmp = Cond.getOperand(1);
9238 unsigned Opc = Cmp.getOpcode();
9239 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
9240 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
9241 Cond = Cmp;
9242 addTest = false;
9243 } else {
9244 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
9245 default: break;
9246 case X86::COND_O:
9247 case X86::COND_B:
9248 // These can only come from an arithmetic instruction with overflow,
9249 // e.g. SADDO, UADDO.
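// Use the flag result of the arithmetic node directly (it is operand 1 of
// the SETCC value) as the branch condition.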
9250 Cond = Cond.getNode()->getOperand(1); 9251 addTest = false; 9252 break; 9253 } 9254 } 9255 } 9256 CondOpcode = Cond.getOpcode(); 9257 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || 9258 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || 9259 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && 9260 Cond.getOperand(0).getValueType() != MVT::i8)) { 9261 SDValue LHS = Cond.getOperand(0); 9262 SDValue RHS = Cond.getOperand(1); 9263 unsigned X86Opcode; 9264 unsigned X86Cond; 9265 SDVTList VTs; 9266 switch (CondOpcode) { 9267 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 9268 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 9269 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 9270 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 9271 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 9272 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 9273 default: llvm_unreachable("unexpected overflowing operator"); 9274 } 9275 if (Inverted) 9276 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); 9277 if (CondOpcode == ISD::UMULO) 9278 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 9279 MVT::i32); 9280 else 9281 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 9282 9283 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); 9284 9285 if (CondOpcode == ISD::UMULO) 9286 Cond = X86Op.getValue(2); 9287 else 9288 Cond = X86Op.getValue(1); 9289 9290 CC = DAG.getConstant(X86Cond, MVT::i8); 9291 addTest = false; 9292 } else { 9293 unsigned CondOpc; 9294 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 9295 SDValue Cmp = Cond.getOperand(0).getOperand(1); 9296 if (CondOpc == ISD::OR) { 9297 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 9298 // two branches instead of an explicit OR instruction with a 9299 // separate test. 9300 if (Cmp == Cond.getOperand(1).getOperand(1) && 9301 isX86LogicalCmp(Cmp)) { 9302 CC = Cond.getOperand(0).getOperand(0); 9303 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 9304 Chain, Dest, CC, Cmp); 9305 CC = Cond.getOperand(1).getOperand(0); 9306 Cond = Cmp; 9307 addTest = false; 9308 } 9309 } else { // ISD::AND 9310 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 9311 // two branches instead of an explicit AND instruction with a 9312 // separate test. However, we only do this if this block doesn't 9313 // have a fall-through edge, because this requires an explicit 9314 // jmp when the condition is false. 9315 if (Cmp == Cond.getOperand(1).getOperand(1) && 9316 isX86LogicalCmp(Cmp) && 9317 Op.getNode()->hasOneUse()) { 9318 X86::CondCode CCode = 9319 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 9320 CCode = X86::GetOppositeBranchCondition(CCode); 9321 CC = DAG.getConstant(CCode, MVT::i8); 9322 SDNode *User = *Op.getNode()->use_begin(); 9323 // Look for an unconditional branch following this conditional branch. 9324 // We need this because we need to reverse the successors in order 9325 // to implement FCMP_OEQ. 
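// Roughly: brcond (and (setcc cc1), (setcc cc2)), TBB followed by br FBB
// becomes j!cc1 FBB; j!cc2 FBB; jmp TBB -- both conditions are inverted
// and retargeted at the false block.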
9326 if (User->getOpcode() == ISD::BR) {
9327 SDValue FalseBB = User->getOperand(1);
9328 SDNode *NewBR =
9329 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
9330 assert(NewBR == User);
9331 (void)NewBR;
9332 Dest = FalseBB;
9333
9334 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9335 Chain, Dest, CC, Cmp);
9336 X86::CondCode CCode =
9337 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
9338 CCode = X86::GetOppositeBranchCondition(CCode);
9339 CC = DAG.getConstant(CCode, MVT::i8);
9340 Cond = Cmp;
9341 addTest = false;
9342 }
9343 }
9344 }
9345 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
9346 // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition.
9347 // It should be transformed by the dag combiner except when the condition
9348 // is set by an arithmetic-with-overflow node.
9349 X86::CondCode CCode =
9350 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
9351 CCode = X86::GetOppositeBranchCondition(CCode);
9352 CC = DAG.getConstant(CCode, MVT::i8);
9353 Cond = Cond.getOperand(0).getOperand(1);
9354 addTest = false;
9355 } else if (Cond.getOpcode() == ISD::SETCC &&
9356 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
9357 // For FCMP_OEQ, we can emit
9358 // two branches instead of an explicit AND instruction with a
9359 // separate test. However, we only do this if this block doesn't
9360 // have a fall-through edge, because this requires an explicit
9361 // jmp when the condition is false.
9362 if (Op.getNode()->hasOneUse()) {
9363 SDNode *User = *Op.getNode()->use_begin();
9364 // Look for an unconditional branch following this conditional branch.
9365 // We need this because we need to reverse the successors in order
9366 // to implement FCMP_OEQ.
9367 if (User->getOpcode() == ISD::BR) {
9368 SDValue FalseBB = User->getOperand(1);
9369 SDNode *NewBR =
9370 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
9371 assert(NewBR == User);
9372 (void)NewBR;
9373 Dest = FalseBB;
9374
9375 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
9376 Cond.getOperand(0), Cond.getOperand(1));
9377 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
9378 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9379 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9380 Chain, Dest, CC, Cmp);
9381 CC = DAG.getConstant(X86::COND_P, MVT::i8);
9382 Cond = Cmp;
9383 addTest = false;
9384 }
9385 }
9386 } else if (Cond.getOpcode() == ISD::SETCC &&
9387 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
9388 // For FCMP_UNE, we can emit
9389 // two branches instead of an explicit OR instruction with a
9390 // separate test. However, we only do this if this block doesn't
9391 // have a fall-through edge, because this requires an explicit
9392 // jmp when the condition is false.
9393 if (Op.getNode()->hasOneUse()) {
9394 SDNode *User = *Op.getNode()->use_begin();
9395 // Look for an unconditional branch following this conditional branch.
9396 // We need this because we need to reverse the successors in order
9397 // to implement FCMP_UNE.
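// On x86 an unordered compare sets ZF, PF and CF, so FCMP_UNE is emitted
// as JNE <true dest> then JNP <false dest>; an unordered result takes
// neither branch and falls into the retargeted unconditional jump.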
9398 if (User->getOpcode() == ISD::BR) {
9399 SDValue FalseBB = User->getOperand(1);
9400 SDNode *NewBR =
9401 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
9402 assert(NewBR == User);
9403 (void)NewBR;
9404
9405 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
9406 Cond.getOperand(0), Cond.getOperand(1));
9407 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
9408 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9409 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9410 Chain, Dest, CC, Cmp);
9411 CC = DAG.getConstant(X86::COND_NP, MVT::i8);
9412 Cond = Cmp;
9413 addTest = false;
9414 Dest = FalseBB;
9415 }
9416 }
9417 }
9418 }
9419
9420 if (addTest) {
9421 // Look past the truncate if the high bits are known zero.
9422 if (isTruncWithZeroHighBitsInput(Cond, DAG))
9423 Cond = Cond.getOperand(0);
9424
9425 // We know the result of AND is compared against zero. Try to match
9426 // it to BT.
9427 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
9428 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
9429 if (NewSetCC.getNode()) {
9430 CC = NewSetCC.getOperand(0);
9431 Cond = NewSetCC.getOperand(1);
9432 addTest = false;
9433 }
9434 }
9435 }
9436
9437 if (addTest) {
9438 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9439 Cond = EmitTest(Cond, X86::COND_NE, DAG);
9440 }
9441 Cond = ConvertCmpIfNecessary(Cond, DAG);
9442 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9443 Chain, Dest, CC, Cond);
9444 }
9445
9446
9447 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
9448 // Calls to _alloca are needed to probe the stack when allocating more than 4k
9449 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
9450 // that the guard pages used by the OS virtual memory manager are allocated in
9451 // correct sequence.
9452 SDValue
9453 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
9454 SelectionDAG &DAG) const {
9455 assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
9456 getTargetMachine().Options.EnableSegmentedStacks) &&
9457 "This should be used only on Windows targets or when segmented stacks "
9458 "are being used");
9459 assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
9460 DebugLoc dl = Op.getDebugLoc();
9461
9462 // Get the inputs.
9463 SDValue Chain = Op.getOperand(0);
9464 SDValue Size = Op.getOperand(1);
9465 // FIXME: Ensure alignment here
9466
9467 bool Is64Bit = Subtarget->is64Bit();
9468 EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
9469
9470 if (getTargetMachine().Options.EnableSegmentedStacks) {
9471 MachineFunction &MF = DAG.getMachineFunction();
9472 MachineRegisterInfo &MRI = MF.getRegInfo();
9473
9474 if (Is64Bit) {
9475 // The 64 bit implementation of segmented stacks needs to clobber both r10
9476 // and r11. This makes it impossible to use it along with nested parameters.
9477 const Function *F = MF.getFunction();
9478
9479 for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
9480 I != E; ++I)
9481 if (I->hasNestAttr())
9482 report_fatal_error("Cannot use segmented stacks with functions that "
9483 "have nested arguments.");
9484 }
9485
9486 const TargetRegisterClass *AddrRegClass =
9487 getRegClassFor(Subtarget->is64Bit() ?
MVT::i64:MVT::i32); 9488 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); 9489 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); 9490 SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, 9491 DAG.getRegister(Vreg, SPTy)); 9492 SDValue Ops1[2] = { Value, Chain }; 9493 return DAG.getMergeValues(Ops1, 2, dl); 9494 } else { 9495 SDValue Flag; 9496 unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX); 9497 9498 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); 9499 Flag = Chain.getValue(1); 9500 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 9501 9502 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 9503 Flag = Chain.getValue(1); 9504 9505 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 9506 9507 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 9508 return DAG.getMergeValues(Ops1, 2, dl); 9509 } 9510} 9511 9512SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 9513 MachineFunction &MF = DAG.getMachineFunction(); 9514 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 9515 9516 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 9517 DebugLoc DL = Op.getDebugLoc(); 9518 9519 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 9520 // vastart just stores the address of the VarArgsFrameIndex slot into the 9521 // memory location argument. 9522 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 9523 getPointerTy()); 9524 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 9525 MachinePointerInfo(SV), false, false, 0); 9526 } 9527 9528 // __va_list_tag: 9529 // gp_offset (0 - 6 * 8) 9530 // fp_offset (48 - 48 + 8 * 16) 9531 // overflow_arg_area (point to parameters coming in memory). 9532 // reg_save_area 9533 SmallVector<SDValue, 8> MemOps; 9534 SDValue FIN = Op.getOperand(1); 9535 // Store gp_offset 9536 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 9537 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 9538 MVT::i32), 9539 FIN, MachinePointerInfo(SV), false, false, 0); 9540 MemOps.push_back(Store); 9541 9542 // Store fp_offset 9543 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 9544 FIN, DAG.getIntPtrConstant(4)); 9545 Store = DAG.getStore(Op.getOperand(0), DL, 9546 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 9547 MVT::i32), 9548 FIN, MachinePointerInfo(SV, 4), false, false, 0); 9549 MemOps.push_back(Store); 9550 9551 // Store ptr to overflow_arg_area 9552 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 9553 FIN, DAG.getIntPtrConstant(4)); 9554 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 9555 getPointerTy()); 9556 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 9557 MachinePointerInfo(SV, 8), 9558 false, false, 0); 9559 MemOps.push_back(Store); 9560 9561 // Store ptr to reg_save_area. 
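// reg_save_area lives at byte offset 16: two i32 fields (gp_offset and
// fp_offset) plus the 8-byte overflow_arg_area pointer precede it.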
9562 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 9563 FIN, DAG.getIntPtrConstant(8)); 9564 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 9565 getPointerTy()); 9566 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 9567 MachinePointerInfo(SV, 16), false, false, 0); 9568 MemOps.push_back(Store); 9569 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 9570 &MemOps[0], MemOps.size()); 9571} 9572 9573SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 9574 assert(Subtarget->is64Bit() && 9575 "LowerVAARG only handles 64-bit va_arg!"); 9576 assert((Subtarget->isTargetLinux() || 9577 Subtarget->isTargetDarwin()) && 9578 "Unhandled target in LowerVAARG"); 9579 assert(Op.getNode()->getNumOperands() == 4); 9580 SDValue Chain = Op.getOperand(0); 9581 SDValue SrcPtr = Op.getOperand(1); 9582 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 9583 unsigned Align = Op.getConstantOperandVal(3); 9584 DebugLoc dl = Op.getDebugLoc(); 9585 9586 EVT ArgVT = Op.getNode()->getValueType(0); 9587 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 9588 uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); 9589 uint8_t ArgMode; 9590 9591 // Decide which area this value should be read from. 9592 // TODO: Implement the AMD64 ABI in its entirety. This simple 9593 // selection mechanism works only for the basic types. 9594 if (ArgVT == MVT::f80) { 9595 llvm_unreachable("va_arg for f80 not yet implemented"); 9596 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 9597 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 9598 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 9599 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 9600 } else { 9601 llvm_unreachable("Unhandled argument type in LowerVAARG"); 9602 } 9603 9604 if (ArgMode == 2) { 9605 // Sanity Check: Make sure using fp_offset makes sense. 9606 assert(!getTargetMachine().Options.UseSoftFloat && 9607 !(DAG.getMachineFunction() 9608 .getFunction()->getFnAttributes() 9609 .hasAttribute(Attributes::NoImplicitFloat)) && 9610 Subtarget->hasSSE1()); 9611 } 9612 9613 // Insert VAARG_64 node into the DAG 9614 // VAARG_64 returns two values: Variable Argument Address, Chain 9615 SmallVector<SDValue, 11> InstOps; 9616 InstOps.push_back(Chain); 9617 InstOps.push_back(SrcPtr); 9618 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 9619 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 9620 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 9621 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 9622 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 9623 VTs, &InstOps[0], InstOps.size(), 9624 MVT::i64, 9625 MachinePointerInfo(SV), 9626 /*Align=*/0, 9627 /*Volatile=*/false, 9628 /*ReadMem=*/true, 9629 /*WriteMem=*/true); 9630 Chain = VAARG.getValue(1); 9631 9632 // Load the next argument and return it 9633 return DAG.getLoad(ArgVT, dl, 9634 Chain, 9635 VAARG, 9636 MachinePointerInfo(), 9637 false, false, false, 0); 9638} 9639 9640static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, 9641 SelectionDAG &DAG) { 9642 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
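// That is 4 + 4 + 8 + 8 = 24 bytes, which is exactly what the memcpy
// below copies (with 8-byte alignment).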
9643 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
9644 SDValue Chain = Op.getOperand(0);
9645 SDValue DstPtr = Op.getOperand(1);
9646 SDValue SrcPtr = Op.getOperand(2);
9647 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
9648 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
9649 DebugLoc DL = Op.getDebugLoc();
9650
9651 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
9652 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
9653 false,
9654 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
9655 }
9656
9657 // getTargetVShiftNode - Handle vector element shifts where the shift amount
9658 // may or may not be a constant. Takes immediate version of shift as input.
9659 static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
9660 SDValue SrcOp, SDValue ShAmt,
9661 SelectionDAG &DAG) {
9662 assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
9663
9664 if (isa<ConstantSDNode>(ShAmt)) {
9665 // Constant may be a TargetConstant. Use a regular constant.
9666 uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue();
9667 switch (Opc) {
9668 default: llvm_unreachable("Unknown target vector shift node");
9669 case X86ISD::VSHLI:
9670 case X86ISD::VSRLI:
9671 case X86ISD::VSRAI:
9672 return DAG.getNode(Opc, dl, VT, SrcOp,
9673 DAG.getConstant(ShiftAmt, MVT::i32));
9674 }
9675 }
9676
9677 // Change opcode to non-immediate version
9678 switch (Opc) {
9679 default: llvm_unreachable("Unknown target vector shift node");
9680 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
9681 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
9682 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
9683 }
9684
9685 // Need to build a vector containing the shift amount.
9686 // The shift amount is 32 bits, but SSE instructions read 64 bits, so fill with 0.
9687 SDValue ShOps[4];
9688 ShOps[0] = ShAmt;
9689 ShOps[1] = DAG.getConstant(0, MVT::i32);
9690 ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
9691 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
9692
9693 // The return type has to be a 128-bit type with the same element
9694 // type as the input type.
9695 MVT EltVT = VT.getVectorElementType().getSimpleVT();
9696 EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
9697
9698 ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
9699 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
9700 }
9701
9702 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
9703 DebugLoc dl = Op.getDebugLoc();
9704 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9705 switch (IntNo) {
9706 default: return SDValue(); // Don't custom lower most intrinsics.
9707 // Comparison intrinsics.
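// Each (u)comi intrinsic becomes a COMI/UCOMI node whose EFLAGS result
// feeds a SETCC that is zero-extended to the i32 intrinsic result;
// e.g. roughly: comieq_ss(a, b) -> (zext (setcc cc, (X86ISD::COMI a, b))).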
9708 case Intrinsic::x86_sse_comieq_ss: 9709 case Intrinsic::x86_sse_comilt_ss: 9710 case Intrinsic::x86_sse_comile_ss: 9711 case Intrinsic::x86_sse_comigt_ss: 9712 case Intrinsic::x86_sse_comige_ss: 9713 case Intrinsic::x86_sse_comineq_ss: 9714 case Intrinsic::x86_sse_ucomieq_ss: 9715 case Intrinsic::x86_sse_ucomilt_ss: 9716 case Intrinsic::x86_sse_ucomile_ss: 9717 case Intrinsic::x86_sse_ucomigt_ss: 9718 case Intrinsic::x86_sse_ucomige_ss: 9719 case Intrinsic::x86_sse_ucomineq_ss: 9720 case Intrinsic::x86_sse2_comieq_sd: 9721 case Intrinsic::x86_sse2_comilt_sd: 9722 case Intrinsic::x86_sse2_comile_sd: 9723 case Intrinsic::x86_sse2_comigt_sd: 9724 case Intrinsic::x86_sse2_comige_sd: 9725 case Intrinsic::x86_sse2_comineq_sd: 9726 case Intrinsic::x86_sse2_ucomieq_sd: 9727 case Intrinsic::x86_sse2_ucomilt_sd: 9728 case Intrinsic::x86_sse2_ucomile_sd: 9729 case Intrinsic::x86_sse2_ucomigt_sd: 9730 case Intrinsic::x86_sse2_ucomige_sd: 9731 case Intrinsic::x86_sse2_ucomineq_sd: { 9732 unsigned Opc; 9733 ISD::CondCode CC; 9734 switch (IntNo) { 9735 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 9736 case Intrinsic::x86_sse_comieq_ss: 9737 case Intrinsic::x86_sse2_comieq_sd: 9738 Opc = X86ISD::COMI; 9739 CC = ISD::SETEQ; 9740 break; 9741 case Intrinsic::x86_sse_comilt_ss: 9742 case Intrinsic::x86_sse2_comilt_sd: 9743 Opc = X86ISD::COMI; 9744 CC = ISD::SETLT; 9745 break; 9746 case Intrinsic::x86_sse_comile_ss: 9747 case Intrinsic::x86_sse2_comile_sd: 9748 Opc = X86ISD::COMI; 9749 CC = ISD::SETLE; 9750 break; 9751 case Intrinsic::x86_sse_comigt_ss: 9752 case Intrinsic::x86_sse2_comigt_sd: 9753 Opc = X86ISD::COMI; 9754 CC = ISD::SETGT; 9755 break; 9756 case Intrinsic::x86_sse_comige_ss: 9757 case Intrinsic::x86_sse2_comige_sd: 9758 Opc = X86ISD::COMI; 9759 CC = ISD::SETGE; 9760 break; 9761 case Intrinsic::x86_sse_comineq_ss: 9762 case Intrinsic::x86_sse2_comineq_sd: 9763 Opc = X86ISD::COMI; 9764 CC = ISD::SETNE; 9765 break; 9766 case Intrinsic::x86_sse_ucomieq_ss: 9767 case Intrinsic::x86_sse2_ucomieq_sd: 9768 Opc = X86ISD::UCOMI; 9769 CC = ISD::SETEQ; 9770 break; 9771 case Intrinsic::x86_sse_ucomilt_ss: 9772 case Intrinsic::x86_sse2_ucomilt_sd: 9773 Opc = X86ISD::UCOMI; 9774 CC = ISD::SETLT; 9775 break; 9776 case Intrinsic::x86_sse_ucomile_ss: 9777 case Intrinsic::x86_sse2_ucomile_sd: 9778 Opc = X86ISD::UCOMI; 9779 CC = ISD::SETLE; 9780 break; 9781 case Intrinsic::x86_sse_ucomigt_ss: 9782 case Intrinsic::x86_sse2_ucomigt_sd: 9783 Opc = X86ISD::UCOMI; 9784 CC = ISD::SETGT; 9785 break; 9786 case Intrinsic::x86_sse_ucomige_ss: 9787 case Intrinsic::x86_sse2_ucomige_sd: 9788 Opc = X86ISD::UCOMI; 9789 CC = ISD::SETGE; 9790 break; 9791 case Intrinsic::x86_sse_ucomineq_ss: 9792 case Intrinsic::x86_sse2_ucomineq_sd: 9793 Opc = X86ISD::UCOMI; 9794 CC = ISD::SETNE; 9795 break; 9796 } 9797 9798 SDValue LHS = Op.getOperand(1); 9799 SDValue RHS = Op.getOperand(2); 9800 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 9801 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 9802 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 9803 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 9804 DAG.getConstant(X86CC, MVT::i8), Cond); 9805 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 9806 } 9807 9808 // Arithmetic intrinsics. 
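// pmuludq multiplies the even-numbered 32-bit lanes into 64-bit products;
// the SSE2 and AVX2 forms both map onto the same X86ISD::PMULUDQ node.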
9809 case Intrinsic::x86_sse2_pmulu_dq: 9810 case Intrinsic::x86_avx2_pmulu_dq: 9811 return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), 9812 Op.getOperand(1), Op.getOperand(2)); 9813 9814 // SSE3/AVX horizontal add/sub intrinsics 9815 case Intrinsic::x86_sse3_hadd_ps: 9816 case Intrinsic::x86_sse3_hadd_pd: 9817 case Intrinsic::x86_avx_hadd_ps_256: 9818 case Intrinsic::x86_avx_hadd_pd_256: 9819 case Intrinsic::x86_sse3_hsub_ps: 9820 case Intrinsic::x86_sse3_hsub_pd: 9821 case Intrinsic::x86_avx_hsub_ps_256: 9822 case Intrinsic::x86_avx_hsub_pd_256: 9823 case Intrinsic::x86_ssse3_phadd_w_128: 9824 case Intrinsic::x86_ssse3_phadd_d_128: 9825 case Intrinsic::x86_avx2_phadd_w: 9826 case Intrinsic::x86_avx2_phadd_d: 9827 case Intrinsic::x86_ssse3_phsub_w_128: 9828 case Intrinsic::x86_ssse3_phsub_d_128: 9829 case Intrinsic::x86_avx2_phsub_w: 9830 case Intrinsic::x86_avx2_phsub_d: { 9831 unsigned Opcode; 9832 switch (IntNo) { 9833 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 9834 case Intrinsic::x86_sse3_hadd_ps: 9835 case Intrinsic::x86_sse3_hadd_pd: 9836 case Intrinsic::x86_avx_hadd_ps_256: 9837 case Intrinsic::x86_avx_hadd_pd_256: 9838 Opcode = X86ISD::FHADD; 9839 break; 9840 case Intrinsic::x86_sse3_hsub_ps: 9841 case Intrinsic::x86_sse3_hsub_pd: 9842 case Intrinsic::x86_avx_hsub_ps_256: 9843 case Intrinsic::x86_avx_hsub_pd_256: 9844 Opcode = X86ISD::FHSUB; 9845 break; 9846 case Intrinsic::x86_ssse3_phadd_w_128: 9847 case Intrinsic::x86_ssse3_phadd_d_128: 9848 case Intrinsic::x86_avx2_phadd_w: 9849 case Intrinsic::x86_avx2_phadd_d: 9850 Opcode = X86ISD::HADD; 9851 break; 9852 case Intrinsic::x86_ssse3_phsub_w_128: 9853 case Intrinsic::x86_ssse3_phsub_d_128: 9854 case Intrinsic::x86_avx2_phsub_w: 9855 case Intrinsic::x86_avx2_phsub_d: 9856 Opcode = X86ISD::HSUB; 9857 break; 9858 } 9859 return DAG.getNode(Opcode, dl, Op.getValueType(), 9860 Op.getOperand(1), Op.getOperand(2)); 9861 } 9862 9863 // AVX2 variable shift intrinsics 9864 case Intrinsic::x86_avx2_psllv_d: 9865 case Intrinsic::x86_avx2_psllv_q: 9866 case Intrinsic::x86_avx2_psllv_d_256: 9867 case Intrinsic::x86_avx2_psllv_q_256: 9868 case Intrinsic::x86_avx2_psrlv_d: 9869 case Intrinsic::x86_avx2_psrlv_q: 9870 case Intrinsic::x86_avx2_psrlv_d_256: 9871 case Intrinsic::x86_avx2_psrlv_q_256: 9872 case Intrinsic::x86_avx2_psrav_d: 9873 case Intrinsic::x86_avx2_psrav_d_256: { 9874 unsigned Opcode; 9875 switch (IntNo) { 9876 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
9877 case Intrinsic::x86_avx2_psllv_d:
9878 case Intrinsic::x86_avx2_psllv_q:
9879 case Intrinsic::x86_avx2_psllv_d_256:
9880 case Intrinsic::x86_avx2_psllv_q_256:
9881 Opcode = ISD::SHL;
9882 break;
9883 case Intrinsic::x86_avx2_psrlv_d:
9884 case Intrinsic::x86_avx2_psrlv_q:
9885 case Intrinsic::x86_avx2_psrlv_d_256:
9886 case Intrinsic::x86_avx2_psrlv_q_256:
9887 Opcode = ISD::SRL;
9888 break;
9889 case Intrinsic::x86_avx2_psrav_d:
9890 case Intrinsic::x86_avx2_psrav_d_256:
9891 Opcode = ISD::SRA;
9892 break;
9893 }
9894 return DAG.getNode(Opcode, dl, Op.getValueType(),
9895 Op.getOperand(1), Op.getOperand(2));
9896 }
9897
9898 case Intrinsic::x86_ssse3_pshuf_b_128:
9899 case Intrinsic::x86_avx2_pshuf_b:
9900 return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
9901 Op.getOperand(1), Op.getOperand(2));
9902
9903 case Intrinsic::x86_ssse3_psign_b_128:
9904 case Intrinsic::x86_ssse3_psign_w_128:
9905 case Intrinsic::x86_ssse3_psign_d_128:
9906 case Intrinsic::x86_avx2_psign_b:
9907 case Intrinsic::x86_avx2_psign_w:
9908 case Intrinsic::x86_avx2_psign_d:
9909 return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
9910 Op.getOperand(1), Op.getOperand(2));
9911
9912 case Intrinsic::x86_sse41_insertps:
9913 return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
9914 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
9915
9916 case Intrinsic::x86_avx_vperm2f128_ps_256:
9917 case Intrinsic::x86_avx_vperm2f128_pd_256:
9918 case Intrinsic::x86_avx_vperm2f128_si_256:
9919 case Intrinsic::x86_avx2_vperm2i128:
9920 return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
9921 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
9922
9923 case Intrinsic::x86_avx2_permd:
9924 case Intrinsic::x86_avx2_permps:
9925 // Operands intentionally swapped. Mask is last operand to intrinsic,
9926 // but second operand for the node/instruction.
9927 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
9928 Op.getOperand(2), Op.getOperand(1));
9929
9930 // ptest and testp intrinsics. The intrinsics these come from are designed to
9931 // return an integer value, not just an instruction, so lower them to the ptest
9932 // or testp pattern and a setcc for the result.
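// e.g. roughly: ptestz(a, b) -> (zext (setcc E, (X86ISD::PTEST a, b))),
// i.e. the i32 result is 1 exactly when the tested flag (here ZF) is set.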
9933 case Intrinsic::x86_sse41_ptestz: 9934 case Intrinsic::x86_sse41_ptestc: 9935 case Intrinsic::x86_sse41_ptestnzc: 9936 case Intrinsic::x86_avx_ptestz_256: 9937 case Intrinsic::x86_avx_ptestc_256: 9938 case Intrinsic::x86_avx_ptestnzc_256: 9939 case Intrinsic::x86_avx_vtestz_ps: 9940 case Intrinsic::x86_avx_vtestc_ps: 9941 case Intrinsic::x86_avx_vtestnzc_ps: 9942 case Intrinsic::x86_avx_vtestz_pd: 9943 case Intrinsic::x86_avx_vtestc_pd: 9944 case Intrinsic::x86_avx_vtestnzc_pd: 9945 case Intrinsic::x86_avx_vtestz_ps_256: 9946 case Intrinsic::x86_avx_vtestc_ps_256: 9947 case Intrinsic::x86_avx_vtestnzc_ps_256: 9948 case Intrinsic::x86_avx_vtestz_pd_256: 9949 case Intrinsic::x86_avx_vtestc_pd_256: 9950 case Intrinsic::x86_avx_vtestnzc_pd_256: { 9951 bool IsTestPacked = false; 9952 unsigned X86CC; 9953 switch (IntNo) { 9954 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 9955 case Intrinsic::x86_avx_vtestz_ps: 9956 case Intrinsic::x86_avx_vtestz_pd: 9957 case Intrinsic::x86_avx_vtestz_ps_256: 9958 case Intrinsic::x86_avx_vtestz_pd_256: 9959 IsTestPacked = true; // Fallthrough 9960 case Intrinsic::x86_sse41_ptestz: 9961 case Intrinsic::x86_avx_ptestz_256: 9962 // ZF = 1 9963 X86CC = X86::COND_E; 9964 break; 9965 case Intrinsic::x86_avx_vtestc_ps: 9966 case Intrinsic::x86_avx_vtestc_pd: 9967 case Intrinsic::x86_avx_vtestc_ps_256: 9968 case Intrinsic::x86_avx_vtestc_pd_256: 9969 IsTestPacked = true; // Fallthrough 9970 case Intrinsic::x86_sse41_ptestc: 9971 case Intrinsic::x86_avx_ptestc_256: 9972 // CF = 1 9973 X86CC = X86::COND_B; 9974 break; 9975 case Intrinsic::x86_avx_vtestnzc_ps: 9976 case Intrinsic::x86_avx_vtestnzc_pd: 9977 case Intrinsic::x86_avx_vtestnzc_ps_256: 9978 case Intrinsic::x86_avx_vtestnzc_pd_256: 9979 IsTestPacked = true; // Fallthrough 9980 case Intrinsic::x86_sse41_ptestnzc: 9981 case Intrinsic::x86_avx_ptestnzc_256: 9982 // ZF and CF = 0 9983 X86CC = X86::COND_A; 9984 break; 9985 } 9986 9987 SDValue LHS = Op.getOperand(1); 9988 SDValue RHS = Op.getOperand(2); 9989 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 9990 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 9991 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 9992 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 9993 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 9994 } 9995 9996 // SSE/AVX shift intrinsics 9997 case Intrinsic::x86_sse2_psll_w: 9998 case Intrinsic::x86_sse2_psll_d: 9999 case Intrinsic::x86_sse2_psll_q: 10000 case Intrinsic::x86_avx2_psll_w: 10001 case Intrinsic::x86_avx2_psll_d: 10002 case Intrinsic::x86_avx2_psll_q: 10003 case Intrinsic::x86_sse2_psrl_w: 10004 case Intrinsic::x86_sse2_psrl_d: 10005 case Intrinsic::x86_sse2_psrl_q: 10006 case Intrinsic::x86_avx2_psrl_w: 10007 case Intrinsic::x86_avx2_psrl_d: 10008 case Intrinsic::x86_avx2_psrl_q: 10009 case Intrinsic::x86_sse2_psra_w: 10010 case Intrinsic::x86_sse2_psra_d: 10011 case Intrinsic::x86_avx2_psra_w: 10012 case Intrinsic::x86_avx2_psra_d: { 10013 unsigned Opcode; 10014 switch (IntNo) { 10015 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
10016 case Intrinsic::x86_sse2_psll_w: 10017 case Intrinsic::x86_sse2_psll_d: 10018 case Intrinsic::x86_sse2_psll_q: 10019 case Intrinsic::x86_avx2_psll_w: 10020 case Intrinsic::x86_avx2_psll_d: 10021 case Intrinsic::x86_avx2_psll_q: 10022 Opcode = X86ISD::VSHL; 10023 break; 10024 case Intrinsic::x86_sse2_psrl_w: 10025 case Intrinsic::x86_sse2_psrl_d: 10026 case Intrinsic::x86_sse2_psrl_q: 10027 case Intrinsic::x86_avx2_psrl_w: 10028 case Intrinsic::x86_avx2_psrl_d: 10029 case Intrinsic::x86_avx2_psrl_q: 10030 Opcode = X86ISD::VSRL; 10031 break; 10032 case Intrinsic::x86_sse2_psra_w: 10033 case Intrinsic::x86_sse2_psra_d: 10034 case Intrinsic::x86_avx2_psra_w: 10035 case Intrinsic::x86_avx2_psra_d: 10036 Opcode = X86ISD::VSRA; 10037 break; 10038 } 10039 return DAG.getNode(Opcode, dl, Op.getValueType(), 10040 Op.getOperand(1), Op.getOperand(2)); 10041 } 10042 10043 // SSE/AVX immediate shift intrinsics 10044 case Intrinsic::x86_sse2_pslli_w: 10045 case Intrinsic::x86_sse2_pslli_d: 10046 case Intrinsic::x86_sse2_pslli_q: 10047 case Intrinsic::x86_avx2_pslli_w: 10048 case Intrinsic::x86_avx2_pslli_d: 10049 case Intrinsic::x86_avx2_pslli_q: 10050 case Intrinsic::x86_sse2_psrli_w: 10051 case Intrinsic::x86_sse2_psrli_d: 10052 case Intrinsic::x86_sse2_psrli_q: 10053 case Intrinsic::x86_avx2_psrli_w: 10054 case Intrinsic::x86_avx2_psrli_d: 10055 case Intrinsic::x86_avx2_psrli_q: 10056 case Intrinsic::x86_sse2_psrai_w: 10057 case Intrinsic::x86_sse2_psrai_d: 10058 case Intrinsic::x86_avx2_psrai_w: 10059 case Intrinsic::x86_avx2_psrai_d: { 10060 unsigned Opcode; 10061 switch (IntNo) { 10062 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 10063 case Intrinsic::x86_sse2_pslli_w: 10064 case Intrinsic::x86_sse2_pslli_d: 10065 case Intrinsic::x86_sse2_pslli_q: 10066 case Intrinsic::x86_avx2_pslli_w: 10067 case Intrinsic::x86_avx2_pslli_d: 10068 case Intrinsic::x86_avx2_pslli_q: 10069 Opcode = X86ISD::VSHLI; 10070 break; 10071 case Intrinsic::x86_sse2_psrli_w: 10072 case Intrinsic::x86_sse2_psrli_d: 10073 case Intrinsic::x86_sse2_psrli_q: 10074 case Intrinsic::x86_avx2_psrli_w: 10075 case Intrinsic::x86_avx2_psrli_d: 10076 case Intrinsic::x86_avx2_psrli_q: 10077 Opcode = X86ISD::VSRLI; 10078 break; 10079 case Intrinsic::x86_sse2_psrai_w: 10080 case Intrinsic::x86_sse2_psrai_d: 10081 case Intrinsic::x86_avx2_psrai_w: 10082 case Intrinsic::x86_avx2_psrai_d: 10083 Opcode = X86ISD::VSRAI; 10084 break; 10085 } 10086 return getTargetVShiftNode(Opcode, dl, Op.getValueType(), 10087 Op.getOperand(1), Op.getOperand(2), DAG); 10088 } 10089 10090 case Intrinsic::x86_sse42_pcmpistria128: 10091 case Intrinsic::x86_sse42_pcmpestria128: 10092 case Intrinsic::x86_sse42_pcmpistric128: 10093 case Intrinsic::x86_sse42_pcmpestric128: 10094 case Intrinsic::x86_sse42_pcmpistrio128: 10095 case Intrinsic::x86_sse42_pcmpestrio128: 10096 case Intrinsic::x86_sse42_pcmpistris128: 10097 case Intrinsic::x86_sse42_pcmpestris128: 10098 case Intrinsic::x86_sse42_pcmpistriz128: 10099 case Intrinsic::x86_sse42_pcmpestriz128: { 10100 unsigned Opcode; 10101 unsigned X86CC; 10102 switch (IntNo) { 10103 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
10104 case Intrinsic::x86_sse42_pcmpistria128: 10105 Opcode = X86ISD::PCMPISTRI; 10106 X86CC = X86::COND_A; 10107 break; 10108 case Intrinsic::x86_sse42_pcmpestria128: 10109 Opcode = X86ISD::PCMPESTRI; 10110 X86CC = X86::COND_A; 10111 break; 10112 case Intrinsic::x86_sse42_pcmpistric128: 10113 Opcode = X86ISD::PCMPISTRI; 10114 X86CC = X86::COND_B; 10115 break; 10116 case Intrinsic::x86_sse42_pcmpestric128: 10117 Opcode = X86ISD::PCMPESTRI; 10118 X86CC = X86::COND_B; 10119 break; 10120 case Intrinsic::x86_sse42_pcmpistrio128: 10121 Opcode = X86ISD::PCMPISTRI; 10122 X86CC = X86::COND_O; 10123 break; 10124 case Intrinsic::x86_sse42_pcmpestrio128: 10125 Opcode = X86ISD::PCMPESTRI; 10126 X86CC = X86::COND_O; 10127 break; 10128 case Intrinsic::x86_sse42_pcmpistris128: 10129 Opcode = X86ISD::PCMPISTRI; 10130 X86CC = X86::COND_S; 10131 break; 10132 case Intrinsic::x86_sse42_pcmpestris128: 10133 Opcode = X86ISD::PCMPESTRI; 10134 X86CC = X86::COND_S; 10135 break; 10136 case Intrinsic::x86_sse42_pcmpistriz128: 10137 Opcode = X86ISD::PCMPISTRI; 10138 X86CC = X86::COND_E; 10139 break; 10140 case Intrinsic::x86_sse42_pcmpestriz128: 10141 Opcode = X86ISD::PCMPESTRI; 10142 X86CC = X86::COND_E; 10143 break; 10144 } 10145 SmallVector<SDValue, 5> NewOps; 10146 NewOps.append(Op->op_begin()+1, Op->op_end()); 10147 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 10148 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); 10149 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 10150 DAG.getConstant(X86CC, MVT::i8), 10151 SDValue(PCMP.getNode(), 1)); 10152 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 10153 } 10154 10155 case Intrinsic::x86_sse42_pcmpistri128: 10156 case Intrinsic::x86_sse42_pcmpestri128: { 10157 unsigned Opcode; 10158 if (IntNo == Intrinsic::x86_sse42_pcmpistri128) 10159 Opcode = X86ISD::PCMPISTRI; 10160 else 10161 Opcode = X86ISD::PCMPESTRI; 10162 10163 SmallVector<SDValue, 5> NewOps; 10164 NewOps.append(Op->op_begin()+1, Op->op_end()); 10165 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 10166 return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); 10167 } 10168 case Intrinsic::x86_fma_vfmadd_ps: 10169 case Intrinsic::x86_fma_vfmadd_pd: 10170 case Intrinsic::x86_fma_vfmsub_ps: 10171 case Intrinsic::x86_fma_vfmsub_pd: 10172 case Intrinsic::x86_fma_vfnmadd_ps: 10173 case Intrinsic::x86_fma_vfnmadd_pd: 10174 case Intrinsic::x86_fma_vfnmsub_ps: 10175 case Intrinsic::x86_fma_vfnmsub_pd: 10176 case Intrinsic::x86_fma_vfmaddsub_ps: 10177 case Intrinsic::x86_fma_vfmaddsub_pd: 10178 case Intrinsic::x86_fma_vfmsubadd_ps: 10179 case Intrinsic::x86_fma_vfmsubadd_pd: 10180 case Intrinsic::x86_fma_vfmadd_ps_256: 10181 case Intrinsic::x86_fma_vfmadd_pd_256: 10182 case Intrinsic::x86_fma_vfmsub_ps_256: 10183 case Intrinsic::x86_fma_vfmsub_pd_256: 10184 case Intrinsic::x86_fma_vfnmadd_ps_256: 10185 case Intrinsic::x86_fma_vfnmadd_pd_256: 10186 case Intrinsic::x86_fma_vfnmsub_ps_256: 10187 case Intrinsic::x86_fma_vfnmsub_pd_256: 10188 case Intrinsic::x86_fma_vfmaddsub_ps_256: 10189 case Intrinsic::x86_fma_vfmaddsub_pd_256: 10190 case Intrinsic::x86_fma_vfmsubadd_ps_256: 10191 case Intrinsic::x86_fma_vfmsubadd_pd_256: { 10192 unsigned Opc; 10193 switch (IntNo) { 10194 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
10195 case Intrinsic::x86_fma_vfmadd_ps: 10196 case Intrinsic::x86_fma_vfmadd_pd: 10197 case Intrinsic::x86_fma_vfmadd_ps_256: 10198 case Intrinsic::x86_fma_vfmadd_pd_256: 10199 Opc = X86ISD::FMADD; 10200 break; 10201 case Intrinsic::x86_fma_vfmsub_ps: 10202 case Intrinsic::x86_fma_vfmsub_pd: 10203 case Intrinsic::x86_fma_vfmsub_ps_256: 10204 case Intrinsic::x86_fma_vfmsub_pd_256: 10205 Opc = X86ISD::FMSUB; 10206 break; 10207 case Intrinsic::x86_fma_vfnmadd_ps: 10208 case Intrinsic::x86_fma_vfnmadd_pd: 10209 case Intrinsic::x86_fma_vfnmadd_ps_256: 10210 case Intrinsic::x86_fma_vfnmadd_pd_256: 10211 Opc = X86ISD::FNMADD; 10212 break; 10213 case Intrinsic::x86_fma_vfnmsub_ps: 10214 case Intrinsic::x86_fma_vfnmsub_pd: 10215 case Intrinsic::x86_fma_vfnmsub_ps_256: 10216 case Intrinsic::x86_fma_vfnmsub_pd_256: 10217 Opc = X86ISD::FNMSUB; 10218 break; 10219 case Intrinsic::x86_fma_vfmaddsub_ps: 10220 case Intrinsic::x86_fma_vfmaddsub_pd: 10221 case Intrinsic::x86_fma_vfmaddsub_ps_256: 10222 case Intrinsic::x86_fma_vfmaddsub_pd_256: 10223 Opc = X86ISD::FMADDSUB; 10224 break; 10225 case Intrinsic::x86_fma_vfmsubadd_ps: 10226 case Intrinsic::x86_fma_vfmsubadd_pd: 10227 case Intrinsic::x86_fma_vfmsubadd_ps_256: 10228 case Intrinsic::x86_fma_vfmsubadd_pd_256: 10229 Opc = X86ISD::FMSUBADD; 10230 break; 10231 } 10232 10233 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), 10234 Op.getOperand(2), Op.getOperand(3)); 10235 } 10236 } 10237} 10238 10239static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) { 10240 DebugLoc dl = Op.getDebugLoc(); 10241 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 10242 switch (IntNo) { 10243 default: return SDValue(); // Don't custom lower most intrinsics. 10244 10245 // RDRAND intrinsics. 10246 case Intrinsic::x86_rdrand_16: 10247 case Intrinsic::x86_rdrand_32: 10248 case Intrinsic::x86_rdrand_64: { 10249 // Emit the node with the right value type. 10250 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); 10251 SDValue Result = DAG.getNode(X86ISD::RDRAND, dl, VTs, Op.getOperand(0)); 10252 10253 // If the value returned by RDRAND was valid (CF=1), return 1. Otherwise 10254 // return the value from Rand, which is always 0, casted to i32. 10255 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), 10256 DAG.getConstant(1, Op->getValueType(1)), 10257 DAG.getConstant(X86::COND_B, MVT::i32), 10258 SDValue(Result.getNode(), 1) }; 10259 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, 10260 DAG.getVTList(Op->getValueType(1), MVT::Glue), 10261 Ops, 4); 10262 10263 // Return { result, isValid, chain }. 10264 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, 10265 SDValue(Result.getNode(), 2)); 10266 } 10267 } 10268} 10269 10270SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 10271 SelectionDAG &DAG) const { 10272 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 10273 MFI->setReturnAddressIsTaken(true); 10274 10275 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 10276 DebugLoc dl = Op.getDebugLoc(); 10277 10278 if (Depth > 0) { 10279 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 10280 SDValue Offset = 10281 DAG.getConstant(TD->getPointerSize(), 10282 Subtarget->is64Bit() ? 
MVT::i64 : MVT::i32); 10283 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 10284 DAG.getNode(ISD::ADD, dl, getPointerTy(), 10285 FrameAddr, Offset), 10286 MachinePointerInfo(), false, false, false, 0); 10287 } 10288 10289 // Just load the return address. 10290 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 10291 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 10292 RetAddrFI, MachinePointerInfo(), false, false, false, 0); 10293} 10294 10295SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 10296 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 10297 MFI->setFrameAddressIsTaken(true); 10298 10299 EVT VT = Op.getValueType(); 10300 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 10301 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 10302 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 10303 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 10304 while (Depth--) 10305 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 10306 MachinePointerInfo(), 10307 false, false, false, 0); 10308 return FrameAddr; 10309} 10310 10311SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 10312 SelectionDAG &DAG) const { 10313 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 10314} 10315 10316SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 10317 SDValue Chain = Op.getOperand(0); 10318 SDValue Offset = Op.getOperand(1); 10319 SDValue Handler = Op.getOperand(2); 10320 DebugLoc dl = Op.getDebugLoc(); 10321 10322 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 10323 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 10324 getPointerTy()); 10325 unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX); 10326 10327 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 10328 DAG.getIntPtrConstant(TD->getPointerSize())); 10329 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 10330 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 10331 false, false, 0); 10332 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 10333 10334 return DAG.getNode(X86ISD::EH_RETURN, dl, 10335 MVT::Other, 10336 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 10337} 10338 10339static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { 10340 return Op.getOperand(0); 10341} 10342 10343SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, 10344 SelectionDAG &DAG) const { 10345 SDValue Root = Op.getOperand(0); 10346 SDValue Trmp = Op.getOperand(1); // trampoline 10347 SDValue FPtr = Op.getOperand(2); // nested function 10348 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 10349 DebugLoc dl = Op.getDebugLoc(); 10350 10351 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 10352 const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); 10353 10354 if (Subtarget->is64Bit()) { 10355 SDValue OutChains[6]; 10356 10357 // Large code-model. 10358 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 10359 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 10360 10361 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; 10362 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; 10363 10364 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 10365 10366 // Load the pointer to the nested function into R11. 
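// Roughly, the six stores below lay down this byte sequence:
//   offset  0: 49 BB <fptr:imm64>   movabsq $fptr, %r11
//   offset 10: 49 BA <nest:imm64>   movabsq $nest, %r10
//   offset 20: 49 FF E3             jmpq *%r11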
10367 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 10368 SDValue Addr = Trmp; 10369 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 10370 Addr, MachinePointerInfo(TrmpAddr), 10371 false, false, 0); 10372 10373 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 10374 DAG.getConstant(2, MVT::i64)); 10375 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 10376 MachinePointerInfo(TrmpAddr, 2), 10377 false, false, 2); 10378 10379 // Load the 'nest' parameter value into R10. 10380 // R10 is specified in X86CallingConv.td 10381 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 10382 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 10383 DAG.getConstant(10, MVT::i64)); 10384 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 10385 Addr, MachinePointerInfo(TrmpAddr, 10), 10386 false, false, 0); 10387 10388 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 10389 DAG.getConstant(12, MVT::i64)); 10390 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 10391 MachinePointerInfo(TrmpAddr, 12), 10392 false, false, 2); 10393 10394 // Jump to the nested function. 10395 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 10396 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 10397 DAG.getConstant(20, MVT::i64)); 10398 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 10399 Addr, MachinePointerInfo(TrmpAddr, 20), 10400 false, false, 0); 10401 10402 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 10403 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 10404 DAG.getConstant(22, MVT::i64)); 10405 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 10406 MachinePointerInfo(TrmpAddr, 22), 10407 false, false, 0); 10408 10409 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6); 10410 } else { 10411 const Function *Func = 10412 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 10413 CallingConv::ID CC = Func->getCallingConv(); 10414 unsigned NestReg; 10415 10416 switch (CC) { 10417 default: 10418 llvm_unreachable("Unsupported calling convention"); 10419 case CallingConv::C: 10420 case CallingConv::X86_StdCall: { 10421 // Pass 'nest' parameter in ECX. 10422 // Must be kept in sync with X86CallingConv.td 10423 NestReg = X86::ECX; 10424 10425 // Check that ECX wasn't needed by an 'inreg' parameter. 10426 FunctionType *FTy = Func->getFunctionType(); 10427 const AttrListPtr &Attrs = Func->getAttributes(); 10428 10429 if (!Attrs.isEmpty() && !Func->isVarArg()) { 10430 unsigned InRegCount = 0; 10431 unsigned Idx = 1; 10432 10433 for (FunctionType::param_iterator I = FTy->param_begin(), 10434 E = FTy->param_end(); I != E; ++I, ++Idx) 10435 if (Attrs.getParamAttributes(Idx).hasAttribute(Attributes::InReg)) 10436 // FIXME: should only count parameters that are lowered to integers. 10437 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 10438 10439 if (InRegCount > 2) { 10440 report_fatal_error("Nest register in use - reduce number of inreg" 10441 " parameters!"); 10442 } 10443 } 10444 break; 10445 } 10446 case CallingConv::X86_FastCall: 10447 case CallingConv::X86_ThisCall: 10448 case CallingConv::Fast: 10449 // Pass 'nest' parameter in EAX. 
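// (ECX already carries arguments for fastcall/thiscall, so EAX is used
// instead.)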
10450 // Must be kept in sync with X86CallingConv.td
10451 NestReg = X86::EAX;
10452 break;
10453 }
10454
10455 SDValue OutChains[4];
10456 SDValue Addr, Disp;
10457
10458 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10459 DAG.getConstant(10, MVT::i32));
10460 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
10461
10462 // This is storing the opcode for MOV32ri.
10463 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
10464 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
10465 OutChains[0] = DAG.getStore(Root, dl,
10466 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
10467 Trmp, MachinePointerInfo(TrmpAddr),
10468 false, false, 0);
10469
10470 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10471 DAG.getConstant(1, MVT::i32));
10472 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
10473 MachinePointerInfo(TrmpAddr, 1),
10474 false, false, 1);
10475
10476 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
10477 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10478 DAG.getConstant(5, MVT::i32));
10479 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
10480 MachinePointerInfo(TrmpAddr, 5),
10481 false, false, 1);
10482
10483 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10484 DAG.getConstant(6, MVT::i32));
10485 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
10486 MachinePointerInfo(TrmpAddr, 6),
10487 false, false, 1);
10488
10489 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
10490 }
10491 }
10492
10493 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
10494 SelectionDAG &DAG) const {
10495 /*
10496 The rounding mode is in bits 11:10 of the FPCW (the x87 FP control word), and has the following
10497 settings:
10498 00 Round to nearest
10499 01 Round to -inf
10500 10 Round to +inf
10501 11 Round to 0
10502
10503 FLT_ROUNDS, on the other hand, expects the following:
10504 -1 Undefined
10505 0 Round to 0
10506 1 Round to nearest
10507 2 Round to +inf
10508 3 Round to -inf
10509
10510 To perform the conversion, we do:
10511 (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
10512 */
10513
10514 MachineFunction &MF = DAG.getMachineFunction();
10515 const TargetMachine &TM = MF.getTarget();
10516 const TargetFrameLowering &TFI = *TM.getFrameLowering();
10517 unsigned StackAlignment = TFI.getStackAlignment();
10518 EVT VT = Op.getValueType();
10519 DebugLoc DL = Op.getDebugLoc();
10520
10521 // Save FP Control Word to stack slot
10522 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
10523 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
10524
10525
10526 MachineMemOperand *MMO =
10527 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
10528 MachineMemOperand::MOStore, 2, 2);
10529
10530 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
10531 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
10532 DAG.getVTList(MVT::Other),
10533 Ops, 2, MVT::i16, MMO);
10534
10535 // Load FP Control Word from stack slot
10536 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
10537 MachinePointerInfo(), false, false, false, 0);
10538
10539 // Transform as necessary
10540 SDValue CWD1 =
10541 DAG.getNode(ISD::SRL, DL, MVT::i16,
10542 DAG.getNode(ISD::AND, DL, MVT::i16,
10543 CWD, DAG.getConstant(0x800, MVT::i16)),
10544 DAG.getConstant(11, MVT::i8));
10545 SDValue CWD2 =
10546 DAG.getNode(ISD::SRL, DL, MVT::i16,
10547 DAG.getNode(ISD::AND, DL, MVT::i16,
10548 CWD, DAG.getConstant(0x400, MVT::i16)),
10549 DAG.getConstant(9, MVT::i8));
10550
10551 SDValue
RetVal = 10552 DAG.getNode(ISD::AND, DL, MVT::i16, 10553 DAG.getNode(ISD::ADD, DL, MVT::i16, 10554 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 10555 DAG.getConstant(1, MVT::i16)), 10556 DAG.getConstant(3, MVT::i16)); 10557 10558 10559 return DAG.getNode((VT.getSizeInBits() < 16 ? 10560 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 10561} 10562 10563static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { 10564 EVT VT = Op.getValueType(); 10565 EVT OpVT = VT; 10566 unsigned NumBits = VT.getSizeInBits(); 10567 DebugLoc dl = Op.getDebugLoc(); 10568 10569 Op = Op.getOperand(0); 10570 if (VT == MVT::i8) { 10571 // Zero extend to i32 since there is not an i8 bsr. 10572 OpVT = MVT::i32; 10573 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 10574 } 10575 10576 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 10577 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 10578 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 10579 10580 // If src is zero (i.e. bsr sets ZF), returns NumBits. 10581 SDValue Ops[] = { 10582 Op, 10583 DAG.getConstant(NumBits+NumBits-1, OpVT), 10584 DAG.getConstant(X86::COND_E, MVT::i8), 10585 Op.getValue(1) 10586 }; 10587 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 10588 10589 // Finally xor with NumBits-1. 10590 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 10591 10592 if (VT == MVT::i8) 10593 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 10594 return Op; 10595} 10596 10597static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { 10598 EVT VT = Op.getValueType(); 10599 EVT OpVT = VT; 10600 unsigned NumBits = VT.getSizeInBits(); 10601 DebugLoc dl = Op.getDebugLoc(); 10602 10603 Op = Op.getOperand(0); 10604 if (VT == MVT::i8) { 10605 // Zero extend to i32 since there is not an i8 bsr. 10606 OpVT = MVT::i32; 10607 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 10608 } 10609 10610 // Issue a bsr (scan bits in reverse). 10611 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 10612 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 10613 10614 // And xor with NumBits-1. 10615 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 10616 10617 if (VT == MVT::i8) 10618 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 10619 return Op; 10620} 10621 10622static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 10623 EVT VT = Op.getValueType(); 10624 unsigned NumBits = VT.getSizeInBits(); 10625 DebugLoc dl = Op.getDebugLoc(); 10626 Op = Op.getOperand(0); 10627 10628 // Issue a bsf (scan bits forward) which also sets EFLAGS. 10629 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 10630 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 10631 10632 // If src is zero (i.e. bsf sets ZF), returns NumBits. 10633 SDValue Ops[] = { 10634 Op, 10635 DAG.getConstant(NumBits, VT), 10636 DAG.getConstant(X86::COND_E, MVT::i8), 10637 Op.getValue(1) 10638 }; 10639 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops)); 10640} 10641 10642// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit 10643// ones, and then concatenate the result back. 
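// e.g. roughly: (v8i32 add A, B) ->
//   (concat_vectors (add (extract_subvector A, 0), (extract_subvector B, 0)),
//                   (add (extract_subvector A, 4), (extract_subvector B, 4)))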
10644static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { 10645 EVT VT = Op.getValueType(); 10646 10647 assert(VT.is256BitVector() && VT.isInteger() && 10648 "Unsupported value type for operation"); 10649 10650 unsigned NumElems = VT.getVectorNumElements(); 10651 DebugLoc dl = Op.getDebugLoc(); 10652 10653 // Extract the LHS vectors 10654 SDValue LHS = Op.getOperand(0); 10655 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 10656 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 10657 10658 // Extract the RHS vectors 10659 SDValue RHS = Op.getOperand(1); 10660 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); 10661 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); 10662 10663 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 10664 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 10665 10666 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 10667 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), 10668 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); 10669} 10670 10671static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { 10672 assert(Op.getValueType().is256BitVector() && 10673 Op.getValueType().isInteger() && 10674 "Only handle AVX 256-bit vector integer operation"); 10675 return Lower256IntArith(Op, DAG); 10676} 10677 10678static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { 10679 assert(Op.getValueType().is256BitVector() && 10680 Op.getValueType().isInteger() && 10681 "Only handle AVX 256-bit vector integer operation"); 10682 return Lower256IntArith(Op, DAG); 10683} 10684 10685static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, 10686 SelectionDAG &DAG) { 10687 EVT VT = Op.getValueType(); 10688 10689 // Decompose 256-bit ops into smaller 128-bit ops. 10690 if (VT.is256BitVector() && !Subtarget->hasAVX2()) 10691 return Lower256IntArith(Op, DAG); 10692 10693 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && 10694 "Only know how to lower V2I64/V4I64 multiply"); 10695 10696 DebugLoc dl = Op.getDebugLoc(); 10697 10698 // Ahi = psrlqi(a, 32); 10699 // Bhi = psrlqi(b, 32); 10700 // 10701 // AloBlo = pmuludq(a, b); 10702 // AloBhi = pmuludq(a, Bhi); 10703 // AhiBlo = pmuludq(Ahi, b); 10704 10705 // AloBhi = psllqi(AloBhi, 32); 10706 // AhiBlo = psllqi(AhiBlo, 32); 10707 // return AloBlo + AloBhi + AhiBlo; 10708 10709 SDValue A = Op.getOperand(0); 10710 SDValue B = Op.getOperand(1); 10711 10712 SDValue ShAmt = DAG.getConstant(32, MVT::i32); 10713 10714 SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt); 10715 SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt); 10716 10717 // Bit cast to 32-bit vectors for MULUDQ 10718 EVT MulVT = (VT == MVT::v2i64) ? 
MVT::v4i32 : MVT::v8i32; 10719 A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); 10720 B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); 10721 Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); 10722 Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi); 10723 10724 SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); 10725 SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); 10726 SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); 10727 10728 AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt); 10729 AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt); 10730 10731 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 10732 return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 10733} 10734 10735SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 10736 10737 EVT VT = Op.getValueType(); 10738 DebugLoc dl = Op.getDebugLoc(); 10739 SDValue R = Op.getOperand(0); 10740 SDValue Amt = Op.getOperand(1); 10741 LLVMContext *Context = DAG.getContext(); 10742 10743 if (!Subtarget->hasSSE2()) 10744 return SDValue(); 10745 10746 // Optimize shl/srl/sra with constant shift amount. 10747 if (isSplatVector(Amt.getNode())) { 10748 SDValue SclrAmt = Amt->getOperand(0); 10749 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { 10750 uint64_t ShiftAmt = C->getZExtValue(); 10751 10752 if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || 10753 (Subtarget->hasAVX2() && 10754 (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) { 10755 if (Op.getOpcode() == ISD::SHL) 10756 return DAG.getNode(X86ISD::VSHLI, dl, VT, R, 10757 DAG.getConstant(ShiftAmt, MVT::i32)); 10758 if (Op.getOpcode() == ISD::SRL) 10759 return DAG.getNode(X86ISD::VSRLI, dl, VT, R, 10760 DAG.getConstant(ShiftAmt, MVT::i32)); 10761 if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) 10762 return DAG.getNode(X86ISD::VSRAI, dl, VT, R, 10763 DAG.getConstant(ShiftAmt, MVT::i32)); 10764 } 10765 10766 if (VT == MVT::v16i8) { 10767 if (Op.getOpcode() == ISD::SHL) { 10768 // Make a large shift. 10769 SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R, 10770 DAG.getConstant(ShiftAmt, MVT::i32)); 10771 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); 10772 // Zero out the rightmost bits. 10773 SmallVector<SDValue, 16> V(16, 10774 DAG.getConstant(uint8_t(-1U << ShiftAmt), 10775 MVT::i8)); 10776 return DAG.getNode(ISD::AND, dl, VT, SHL, 10777 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); 10778 } 10779 if (Op.getOpcode() == ISD::SRL) { 10780 // Make a large shift. 10781 SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R, 10782 DAG.getConstant(ShiftAmt, MVT::i32)); 10783 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); 10784 // Zero out the leftmost bits. 
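// For example, a v16i8 srl by 3 is really a v8i16 srl by 3: the top three
// bits of each low byte receive bits from the adjoining high byte, so every
// byte is ANDed with 0xFF >> 3 = 0x1F to clear them.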
10785 SmallVector<SDValue, 16> V(16, 10786 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 10787 MVT::i8)); 10788 return DAG.getNode(ISD::AND, dl, VT, SRL, 10789 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); 10790 } 10791 if (Op.getOpcode() == ISD::SRA) { 10792 if (ShiftAmt == 7) { 10793 // R s>> 7 === R s< 0 10794 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 10795 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 10796 } 10797 10798 // R s>> a === ((R u>> a) ^ m) - m 10799 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 10800 SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, 10801 MVT::i8)); 10802 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); 10803 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 10804 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 10805 return Res; 10806 } 10807 llvm_unreachable("Unknown shift opcode."); 10808 } 10809 10810 if (Subtarget->hasAVX2() && VT == MVT::v32i8) { 10811 if (Op.getOpcode() == ISD::SHL) { 10812 // Make a large shift. 10813 SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R, 10814 DAG.getConstant(ShiftAmt, MVT::i32)); 10815 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); 10816 // Zero out the rightmost bits. 10817 SmallVector<SDValue, 32> V(32, 10818 DAG.getConstant(uint8_t(-1U << ShiftAmt), 10819 MVT::i8)); 10820 return DAG.getNode(ISD::AND, dl, VT, SHL, 10821 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); 10822 } 10823 if (Op.getOpcode() == ISD::SRL) { 10824 // Make a large shift. 10825 SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R, 10826 DAG.getConstant(ShiftAmt, MVT::i32)); 10827 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); 10828 // Zero out the leftmost bits. 10829 SmallVector<SDValue, 32> V(32, 10830 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 10831 MVT::i8)); 10832 return DAG.getNode(ISD::AND, dl, VT, SRL, 10833 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); 10834 } 10835 if (Op.getOpcode() == ISD::SRA) { 10836 if (ShiftAmt == 7) { 10837 // R s>> 7 === R s< 0 10838 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 10839 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 10840 } 10841 10842 // R s>> a === ((R u>> a) ^ m) - m 10843 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 10844 SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, 10845 MVT::i8)); 10846 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32); 10847 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 10848 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 10849 return Res; 10850 } 10851 llvm_unreachable("Unknown shift opcode."); 10852 } 10853 } 10854 } 10855 10856 // Lower SHL with variable shift amount. 
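// The v4i32 case below uses the "build 2^amt in the FP exponent" trick (a
// sketch of the idea, not SSE syntax): shifting each lane's amount into the
// IEEE-754 exponent field and adding the bit pattern of 1.0f produces the
// float 2^amt, which fp_to_sint turns back into an integer multiplier:
//
//   (amt << 23) + 0x3f800000   // e.g. amt = 5: 0x42000000 == 32.0f
//   R << amt  ==  R * (int)2^amt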
10857 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { 10858 Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1), 10859 DAG.getConstant(23, MVT::i32)); 10860 10861 const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U}; 10862 Constant *C = ConstantDataVector::get(*Context, CV); 10863 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 10864 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 10865 MachinePointerInfo::getConstantPool(), 10866 false, false, false, 16); 10867 10868 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 10869 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); 10870 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 10871 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 10872 } 10873 if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { 10874 assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); 10875 10876 // a = a << 5; 10877 Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1), 10878 DAG.getConstant(5, MVT::i32)); 10879 Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); 10880 10881 // Turn 'a' into a mask suitable for VSELECT 10882 SDValue VSelM = DAG.getConstant(0x80, VT); 10883 SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 10884 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 10885 10886 SDValue CM1 = DAG.getConstant(0x0f, VT); 10887 SDValue CM2 = DAG.getConstant(0x3f, VT); 10888 10889 // r = VSELECT(r, psllw(r & (char16)15, 4), a); 10890 SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1); 10891 M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 10892 DAG.getConstant(4, MVT::i32), DAG); 10893 M = DAG.getNode(ISD::BITCAST, dl, VT, M); 10894 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 10895 10896 // a += a 10897 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 10898 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 10899 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 10900 10901 // r = VSELECT(r, psllw(r & (char16)63, 2), a); 10902 M = DAG.getNode(ISD::AND, dl, VT, R, CM2); 10903 M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 10904 DAG.getConstant(2, MVT::i32), DAG); 10905 M = DAG.getNode(ISD::BITCAST, dl, VT, M); 10906 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 10907 10908 // a += a 10909 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 10910 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 10911 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 10912 10913 // return VSELECT(r, r+r, a); 10914 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, 10915 DAG.getNode(ISD::ADD, dl, VT, R, R), R); 10916 return R; 10917 } 10918 10919 // Decompose 256-bit shifts into smaller 128-bit shifts. 
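// The split mirrors Lower256IntArith, but the shift amount needs care: a
// constant BUILD_VECTOR amount is re-split element by element, while a
// variable amount is extracted in 128-bit halves. Sketch for v8i32 (names
// illustrative):
//
//   lo = shl(extract(R, 0), amt_lo)   ; v4i32
//   hi = shl(extract(R, 4), amt_hi)   ; v4i32
//   result = concat_vectors(lo, hi)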
10920 if (VT.is256BitVector()) {
10921 unsigned NumElems = VT.getVectorNumElements();
10922 MVT EltVT = VT.getVectorElementType().getSimpleVT();
10923 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
10924
10925 // Extract the two vectors
10926 SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
10927 SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
10928
10929 // Recreate the shift amount vectors
10930 SDValue Amt1, Amt2;
10931 if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
10932 // Constant shift amount
10933 SmallVector<SDValue, 4> Amt1Csts;
10934 SmallVector<SDValue, 4> Amt2Csts;
10935 for (unsigned i = 0; i != NumElems/2; ++i)
10936 Amt1Csts.push_back(Amt->getOperand(i));
10937 for (unsigned i = NumElems/2; i != NumElems; ++i)
10938 Amt2Csts.push_back(Amt->getOperand(i));
10939
10940 Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
10941 &Amt1Csts[0], NumElems/2);
10942 Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
10943 &Amt2Csts[0], NumElems/2);
10944 } else {
10945 // Variable shift amount
10946 Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
10947 Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
10948 }
10949
10950 // Issue new vector shifts for the smaller types
10951 V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
10952 V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
10953
10954 // Concatenate the result back
10955 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
10956 }
10957
10958 return SDValue();
10959 }
10960
10961 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
10962 // Lower the "add/sub/mul with overflow" instruction into a regular op plus
10963 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
10964 // looks for this combo and may remove the "setcc" instruction if the "setcc"
10965 // has only one use.
10966 SDNode *N = Op.getNode();
10967 SDValue LHS = N->getOperand(0);
10968 SDValue RHS = N->getOperand(1);
10969 unsigned BaseOp = 0;
10970 unsigned Cond = 0;
10971 DebugLoc DL = Op.getDebugLoc();
10972 switch (Op.getOpcode()) {
10973 default: llvm_unreachable("Unknown ovf instruction!");
10974 case ISD::SADDO:
10975 // An add of one will be selected as an INC. Note that INC doesn't
10976 // set CF, so we can't do this for UADDO.
10977 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
10978 if (C->isOne()) {
10979 BaseOp = X86ISD::INC;
10980 Cond = X86::COND_O;
10981 break;
10982 }
10983 BaseOp = X86ISD::ADD;
10984 Cond = X86::COND_O;
10985 break;
10986 case ISD::UADDO:
10987 BaseOp = X86ISD::ADD;
10988 Cond = X86::COND_B;
10989 break;
10990 case ISD::SSUBO:
10991 // A subtract of one will be selected as a DEC. Note that DEC doesn't
10992 // set CF, so we can't do this for USUBO.
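// For example, "ssubo x, 1" can come out as
//   decl %eax
//   seto %cl
// because dec still updates OF; an unsigned borrow check would need CF,
// which dec leaves untouched (hence the USUBO restriction above).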
10993 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 10994 if (C->isOne()) { 10995 BaseOp = X86ISD::DEC; 10996 Cond = X86::COND_O; 10997 break; 10998 } 10999 BaseOp = X86ISD::SUB; 11000 Cond = X86::COND_O; 11001 break; 11002 case ISD::USUBO: 11003 BaseOp = X86ISD::SUB; 11004 Cond = X86::COND_B; 11005 break; 11006 case ISD::SMULO: 11007 BaseOp = X86ISD::SMUL; 11008 Cond = X86::COND_O; 11009 break; 11010 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 11011 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 11012 MVT::i32); 11013 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 11014 11015 SDValue SetCC = 11016 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 11017 DAG.getConstant(X86::COND_O, MVT::i32), 11018 SDValue(Sum.getNode(), 2)); 11019 11020 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 11021 } 11022 } 11023 11024 // Also sets EFLAGS. 11025 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 11026 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 11027 11028 SDValue SetCC = 11029 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), 11030 DAG.getConstant(Cond, MVT::i32), 11031 SDValue(Sum.getNode(), 1)); 11032 11033 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 11034} 11035 11036SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 11037 SelectionDAG &DAG) const { 11038 DebugLoc dl = Op.getDebugLoc(); 11039 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 11040 EVT VT = Op.getValueType(); 11041 11042 if (!Subtarget->hasSSE2() || !VT.isVector()) 11043 return SDValue(); 11044 11045 unsigned BitsDiff = VT.getScalarType().getSizeInBits() - 11046 ExtraVT.getScalarType().getSizeInBits(); 11047 SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); 11048 11049 switch (VT.getSimpleVT().SimpleTy) { 11050 default: return SDValue(); 11051 case MVT::v8i32: 11052 case MVT::v16i16: 11053 if (!Subtarget->hasAVX()) 11054 return SDValue(); 11055 if (!Subtarget->hasAVX2()) { 11056 // needs to be split 11057 unsigned NumElems = VT.getVectorNumElements(); 11058 11059 // Extract the LHS vectors 11060 SDValue LHS = Op.getOperand(0); 11061 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 11062 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 11063 11064 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 11065 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 11066 11067 EVT ExtraEltVT = ExtraVT.getVectorElementType(); 11068 unsigned ExtraNumElems = ExtraVT.getVectorNumElements(); 11069 ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, 11070 ExtraNumElems/2); 11071 SDValue Extra = DAG.getValueType(ExtraVT); 11072 11073 LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); 11074 LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); 11075 11076 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2); 11077 } 11078 // fall through 11079 case MVT::v4i32: 11080 case MVT::v8i16: { 11081 SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, 11082 Op.getOperand(0), ShAmt, DAG); 11083 return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG); 11084 } 11085 } 11086} 11087 11088 11089static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget, 11090 SelectionDAG &DAG) { 11091 DebugLoc dl = Op.getDebugLoc(); 11092 11093 // Go ahead and emit the fence on x86-64 even if we asked for no-sse2. 11094 // There isn't any reason to disable it if the target processor supports it. 
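// The fallback below emits the moral equivalent of
//   lock orl $0, (%esp)
// a locked read-modify-write of the top of the stack, which the memory
// model treats as a full barrier and which works on any processor, unlike
// mfence (which needs SSE2).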
11095 if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) { 11096 SDValue Chain = Op.getOperand(0); 11097 SDValue Zero = DAG.getConstant(0, MVT::i32); 11098 SDValue Ops[] = { 11099 DAG.getRegister(X86::ESP, MVT::i32), // Base 11100 DAG.getTargetConstant(1, MVT::i8), // Scale 11101 DAG.getRegister(0, MVT::i32), // Index 11102 DAG.getTargetConstant(0, MVT::i32), // Disp 11103 DAG.getRegister(0, MVT::i32), // Segment. 11104 Zero, 11105 Chain 11106 }; 11107 SDNode *Res = 11108 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 11109 array_lengthof(Ops)); 11110 return SDValue(Res, 0); 11111 } 11112 11113 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 11114 if (!isDev) 11115 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 11116 11117 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 11118 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 11119 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 11120 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 11121 11122 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 11123 if (!Op1 && !Op2 && !Op3 && Op4) 11124 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 11125 11126 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 11127 if (Op1 && !Op2 && !Op3 && !Op4) 11128 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 11129 11130 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 11131 // (MFENCE)>; 11132 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 11133} 11134 11135static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, 11136 SelectionDAG &DAG) { 11137 DebugLoc dl = Op.getDebugLoc(); 11138 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( 11139 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); 11140 SynchronizationScope FenceScope = static_cast<SynchronizationScope>( 11141 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 11142 11143 // The only fence that needs an instruction is a sequentially-consistent 11144 // cross-thread fence. 11145 if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { 11146 // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for 11147 // no-sse2). There isn't any reason to disable it if the target processor 11148 // supports it. 11149 if (Subtarget->hasSSE2() || Subtarget->is64Bit()) 11150 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 11151 11152 SDValue Chain = Op.getOperand(0); 11153 SDValue Zero = DAG.getConstant(0, MVT::i32); 11154 SDValue Ops[] = { 11155 DAG.getRegister(X86::ESP, MVT::i32), // Base 11156 DAG.getTargetConstant(1, MVT::i8), // Scale 11157 DAG.getRegister(0, MVT::i32), // Index 11158 DAG.getTargetConstant(0, MVT::i32), // Disp 11159 DAG.getRegister(0, MVT::i32), // Segment. 11160 Zero, 11161 Chain 11162 }; 11163 SDNode *Res = 11164 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 11165 array_lengthof(Ops)); 11166 return SDValue(Res, 0); 11167 } 11168 11169 // MEMBARRIER is a compiler barrier; it codegens to a no-op. 
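// In other words, under x86's strong memory model only a cross-thread
// seq_cst fence costs an instruction; e.g. IR "fence seq_cst" becomes
// mfence (or the locked-or idiom above), while acquire/release fences only
// constrain the compiler and lower to this no-op barrier node.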
11170 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 11171} 11172 11173 11174static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, 11175 SelectionDAG &DAG) { 11176 EVT T = Op.getValueType(); 11177 DebugLoc DL = Op.getDebugLoc(); 11178 unsigned Reg = 0; 11179 unsigned size = 0; 11180 switch(T.getSimpleVT().SimpleTy) { 11181 default: llvm_unreachable("Invalid value type!"); 11182 case MVT::i8: Reg = X86::AL; size = 1; break; 11183 case MVT::i16: Reg = X86::AX; size = 2; break; 11184 case MVT::i32: Reg = X86::EAX; size = 4; break; 11185 case MVT::i64: 11186 assert(Subtarget->is64Bit() && "Node not type legal!"); 11187 Reg = X86::RAX; size = 8; 11188 break; 11189 } 11190 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 11191 Op.getOperand(2), SDValue()); 11192 SDValue Ops[] = { cpIn.getValue(0), 11193 Op.getOperand(1), 11194 Op.getOperand(3), 11195 DAG.getTargetConstant(size, MVT::i8), 11196 cpIn.getValue(1) }; 11197 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 11198 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 11199 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 11200 Ops, 5, T, MMO); 11201 SDValue cpOut = 11202 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 11203 return cpOut; 11204} 11205 11206static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, 11207 SelectionDAG &DAG) { 11208 assert(Subtarget->is64Bit() && "Result not type legalized?"); 11209 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 11210 SDValue TheChain = Op.getOperand(0); 11211 DebugLoc dl = Op.getDebugLoc(); 11212 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 11213 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 11214 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 11215 rax.getValue(2)); 11216 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 11217 DAG.getConstant(32, MVT::i8)); 11218 SDValue Ops[] = { 11219 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 11220 rdx.getValue(1) 11221 }; 11222 return DAG.getMergeValues(Ops, 2, dl); 11223} 11224 11225SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { 11226 EVT SrcVT = Op.getOperand(0).getValueType(); 11227 EVT DstVT = Op.getValueType(); 11228 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && 11229 Subtarget->hasMMX() && "Unexpected custom BITCAST"); 11230 assert((DstVT == MVT::i64 || 11231 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 11232 "Unexpected custom BITCAST"); 11233 // i64 <=> MMX conversions are Legal. 11234 if (SrcVT==MVT::i64 && DstVT.isVector()) 11235 return Op; 11236 if (DstVT==MVT::i64 && SrcVT.isVector()) 11237 return Op; 11238 // MMX <=> MMX conversions are Legal. 11239 if (SrcVT.isVector() && DstVT.isVector()) 11240 return Op; 11241 // All other conversions need to be expanded. 
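// e.g. an i64 <-> x86mmx bitcast is just a movd/movq between an MMX and a
// GR64 register, which is why those pairs are returned unchanged above;
// bitcasts with no such register-to-register form are expanded by the
// generic legalizer (typically through a stack temporary).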
11242 return SDValue(); 11243} 11244 11245static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { 11246 SDNode *Node = Op.getNode(); 11247 DebugLoc dl = Node->getDebugLoc(); 11248 EVT T = Node->getValueType(0); 11249 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 11250 DAG.getConstant(0, T), Node->getOperand(2)); 11251 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 11252 cast<AtomicSDNode>(Node)->getMemoryVT(), 11253 Node->getOperand(0), 11254 Node->getOperand(1), negOp, 11255 cast<AtomicSDNode>(Node)->getSrcValue(), 11256 cast<AtomicSDNode>(Node)->getAlignment(), 11257 cast<AtomicSDNode>(Node)->getOrdering(), 11258 cast<AtomicSDNode>(Node)->getSynchScope()); 11259} 11260 11261static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { 11262 SDNode *Node = Op.getNode(); 11263 DebugLoc dl = Node->getDebugLoc(); 11264 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 11265 11266 // Convert seq_cst store -> xchg 11267 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) 11268 // FIXME: On 32-bit, store -> fist or movq would be more efficient 11269 // (The only way to get a 16-byte store is cmpxchg16b) 11270 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. 11271 if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent || 11272 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 11273 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, 11274 cast<AtomicSDNode>(Node)->getMemoryVT(), 11275 Node->getOperand(0), 11276 Node->getOperand(1), Node->getOperand(2), 11277 cast<AtomicSDNode>(Node)->getMemOperand(), 11278 cast<AtomicSDNode>(Node)->getOrdering(), 11279 cast<AtomicSDNode>(Node)->getSynchScope()); 11280 return Swap.getValue(1); 11281 } 11282 // Other atomic stores have a simple pattern. 11283 return Op; 11284} 11285 11286static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 11287 EVT VT = Op.getNode()->getValueType(0); 11288 11289 // Let legalize expand this if it isn't a legal type yet. 11290 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 11291 return SDValue(); 11292 11293 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 11294 11295 unsigned Opc; 11296 bool ExtraOp = false; 11297 switch (Op.getOpcode()) { 11298 default: llvm_unreachable("Invalid code"); 11299 case ISD::ADDC: Opc = X86ISD::ADD; break; 11300 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; 11301 case ISD::SUBC: Opc = X86ISD::SUB; break; 11302 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; 11303 } 11304 11305 if (!ExtraOp) 11306 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 11307 Op.getOperand(1)); 11308 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 11309 Op.getOperand(1), Op.getOperand(2)); 11310} 11311 11312/// LowerOperation - Provide custom lowering hooks for some operations. 
11313/// 11314SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 11315 switch (Op.getOpcode()) { 11316 default: llvm_unreachable("Should not custom lower this!"); 11317 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); 11318 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, Subtarget, DAG); 11319 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); 11320 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG); 11321 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 11322 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); 11323 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 11324 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 11325 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 11326 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 11327 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 11328 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); 11329 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); 11330 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 11331 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 11332 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 11333 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 11334 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 11335 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 11336 case ISD::SHL_PARTS: 11337 case ISD::SRA_PARTS: 11338 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); 11339 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 11340 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 11341 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 11342 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 11343 case ISD::FP_EXTEND: return lowerFP_EXTEND(Op, DAG); 11344 case ISD::FABS: return LowerFABS(Op, DAG); 11345 case ISD::FNEG: return LowerFNEG(Op, DAG); 11346 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 11347 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); 11348 case ISD::SETCC: return LowerSETCC(Op, DAG); 11349 case ISD::SELECT: return LowerSELECT(Op, DAG); 11350 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 11351 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 11352 case ISD::VASTART: return LowerVASTART(Op, DAG); 11353 case ISD::VAARG: return LowerVAARG(Op, DAG); 11354 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); 11355 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 11356 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); 11357 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 11358 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 11359 case ISD::FRAME_TO_ARGS_OFFSET: 11360 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 11361 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 11362 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 11363 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); 11364 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); 11365 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 11366 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 11367 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); 11368 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 11369 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); 11370 case ISD::SRA: 11371 case ISD::SRL: 11372 case 
ISD::SHL: return LowerShift(Op, DAG); 11373 case ISD::SADDO: 11374 case ISD::UADDO: 11375 case ISD::SSUBO: 11376 case ISD::USUBO: 11377 case ISD::SMULO: 11378 case ISD::UMULO: return LowerXALUO(Op, DAG); 11379 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); 11380 case ISD::BITCAST: return LowerBITCAST(Op, DAG); 11381 case ISD::ADDC: 11382 case ISD::ADDE: 11383 case ISD::SUBC: 11384 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 11385 case ISD::ADD: return LowerADD(Op, DAG); 11386 case ISD::SUB: return LowerSUB(Op, DAG); 11387 } 11388} 11389 11390static void ReplaceATOMIC_LOAD(SDNode *Node, 11391 SmallVectorImpl<SDValue> &Results, 11392 SelectionDAG &DAG) { 11393 DebugLoc dl = Node->getDebugLoc(); 11394 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 11395 11396 // Convert wide load -> cmpxchg8b/cmpxchg16b 11397 // FIXME: On 32-bit, load -> fild or movq would be more efficient 11398 // (The only way to get a 16-byte load is cmpxchg16b) 11399 // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. 11400 SDValue Zero = DAG.getConstant(0, VT); 11401 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT, 11402 Node->getOperand(0), 11403 Node->getOperand(1), Zero, Zero, 11404 cast<AtomicSDNode>(Node)->getMemOperand(), 11405 cast<AtomicSDNode>(Node)->getOrdering(), 11406 cast<AtomicSDNode>(Node)->getSynchScope()); 11407 Results.push_back(Swap.getValue(0)); 11408 Results.push_back(Swap.getValue(1)); 11409} 11410 11411static void 11412ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 11413 SelectionDAG &DAG, unsigned NewOp) { 11414 DebugLoc dl = Node->getDebugLoc(); 11415 assert (Node->getValueType(0) == MVT::i64 && 11416 "Only know how to expand i64 atomics"); 11417 11418 SDValue Chain = Node->getOperand(0); 11419 SDValue In1 = Node->getOperand(1); 11420 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 11421 Node->getOperand(2), DAG.getIntPtrConstant(0)); 11422 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 11423 Node->getOperand(2), DAG.getIntPtrConstant(1)); 11424 SDValue Ops[] = { Chain, In1, In2L, In2H }; 11425 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 11426 SDValue Result = 11427 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 11428 cast<MemSDNode>(Node)->getMemOperand()); 11429 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 11430 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 11431 Results.push_back(Result.getValue(2)); 11432} 11433 11434/// ReplaceNodeResults - Replace a node with an illegal result type 11435/// with a new node built out of custom code. 11436void X86TargetLowering::ReplaceNodeResults(SDNode *N, 11437 SmallVectorImpl<SDValue>&Results, 11438 SelectionDAG &DAG) const { 11439 DebugLoc dl = N->getDebugLoc(); 11440 switch (N->getOpcode()) { 11441 default: 11442 llvm_unreachable("Do not know how to custom type legalize this operation!"); 11443 case ISD::SIGN_EXTEND_INREG: 11444 case ISD::ADDC: 11445 case ISD::ADDE: 11446 case ISD::SUBC: 11447 case ISD::SUBE: 11448 // We don't want to expand or promote these. 
11449 return; 11450 case ISD::FP_TO_SINT: 11451 case ISD::FP_TO_UINT: { 11452 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; 11453 11454 if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType())) 11455 return; 11456 11457 std::pair<SDValue,SDValue> Vals = 11458 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); 11459 SDValue FIST = Vals.first, StackSlot = Vals.second; 11460 if (FIST.getNode() != 0) { 11461 EVT VT = N->getValueType(0); 11462 // Return a load from the stack slot. 11463 if (StackSlot.getNode() != 0) 11464 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 11465 MachinePointerInfo(), 11466 false, false, false, 0)); 11467 else 11468 Results.push_back(FIST); 11469 } 11470 return; 11471 } 11472 case ISD::FP_ROUND: { 11473 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); 11474 Results.push_back(V); 11475 return; 11476 } 11477 case ISD::READCYCLECOUNTER: { 11478 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 11479 SDValue TheChain = N->getOperand(0); 11480 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 11481 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 11482 rd.getValue(1)); 11483 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 11484 eax.getValue(2)); 11485 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 11486 SDValue Ops[] = { eax, edx }; 11487 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 11488 Results.push_back(edx.getValue(1)); 11489 return; 11490 } 11491 case ISD::ATOMIC_CMP_SWAP: { 11492 EVT T = N->getValueType(0); 11493 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); 11494 bool Regs64bit = T == MVT::i128; 11495 EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; 11496 SDValue cpInL, cpInH; 11497 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 11498 DAG.getConstant(0, HalfT)); 11499 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 11500 DAG.getConstant(1, HalfT)); 11501 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, 11502 Regs64bit ? X86::RAX : X86::EAX, 11503 cpInL, SDValue()); 11504 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, 11505 Regs64bit ? X86::RDX : X86::EDX, 11506 cpInH, cpInL.getValue(1)); 11507 SDValue swapInL, swapInH; 11508 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 11509 DAG.getConstant(0, HalfT)); 11510 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 11511 DAG.getConstant(1, HalfT)); 11512 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, 11513 Regs64bit ? X86::RBX : X86::EBX, 11514 swapInL, cpInH.getValue(1)); 11515 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, 11516 Regs64bit ? X86::RCX : X86::ECX, 11517 swapInH, swapInL.getValue(1)); 11518 SDValue Ops[] = { swapInH.getValue(0), 11519 N->getOperand(1), 11520 swapInH.getValue(1) }; 11521 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 11522 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 11523 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : 11524 X86ISD::LCMPXCHG8_DAG; 11525 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, 11526 Ops, 3, T, MMO); 11527 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, 11528 Regs64bit ? X86::RAX : X86::EAX, 11529 HalfT, Result.getValue(1)); 11530 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, 11531 Regs64bit ? 
X86::RDX : X86::EDX, 11532 HalfT, cpOutL.getValue(2)); 11533 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 11534 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2)); 11535 Results.push_back(cpOutH.getValue(1)); 11536 return; 11537 } 11538 case ISD::ATOMIC_LOAD_ADD: 11539 case ISD::ATOMIC_LOAD_AND: 11540 case ISD::ATOMIC_LOAD_NAND: 11541 case ISD::ATOMIC_LOAD_OR: 11542 case ISD::ATOMIC_LOAD_SUB: 11543 case ISD::ATOMIC_LOAD_XOR: 11544 case ISD::ATOMIC_LOAD_MAX: 11545 case ISD::ATOMIC_LOAD_MIN: 11546 case ISD::ATOMIC_LOAD_UMAX: 11547 case ISD::ATOMIC_LOAD_UMIN: 11548 case ISD::ATOMIC_SWAP: { 11549 unsigned Opc; 11550 switch (N->getOpcode()) { 11551 default: llvm_unreachable("Unexpected opcode"); 11552 case ISD::ATOMIC_LOAD_ADD: 11553 Opc = X86ISD::ATOMADD64_DAG; 11554 break; 11555 case ISD::ATOMIC_LOAD_AND: 11556 Opc = X86ISD::ATOMAND64_DAG; 11557 break; 11558 case ISD::ATOMIC_LOAD_NAND: 11559 Opc = X86ISD::ATOMNAND64_DAG; 11560 break; 11561 case ISD::ATOMIC_LOAD_OR: 11562 Opc = X86ISD::ATOMOR64_DAG; 11563 break; 11564 case ISD::ATOMIC_LOAD_SUB: 11565 Opc = X86ISD::ATOMSUB64_DAG; 11566 break; 11567 case ISD::ATOMIC_LOAD_XOR: 11568 Opc = X86ISD::ATOMXOR64_DAG; 11569 break; 11570 case ISD::ATOMIC_LOAD_MAX: 11571 Opc = X86ISD::ATOMMAX64_DAG; 11572 break; 11573 case ISD::ATOMIC_LOAD_MIN: 11574 Opc = X86ISD::ATOMMIN64_DAG; 11575 break; 11576 case ISD::ATOMIC_LOAD_UMAX: 11577 Opc = X86ISD::ATOMUMAX64_DAG; 11578 break; 11579 case ISD::ATOMIC_LOAD_UMIN: 11580 Opc = X86ISD::ATOMUMIN64_DAG; 11581 break; 11582 case ISD::ATOMIC_SWAP: 11583 Opc = X86ISD::ATOMSWAP64_DAG; 11584 break; 11585 } 11586 ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc); 11587 return; 11588 } 11589 case ISD::ATOMIC_LOAD: 11590 ReplaceATOMIC_LOAD(N, Results, DAG); 11591 } 11592} 11593 11594const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 11595 switch (Opcode) { 11596 default: return NULL; 11597 case X86ISD::BSF: return "X86ISD::BSF"; 11598 case X86ISD::BSR: return "X86ISD::BSR"; 11599 case X86ISD::SHLD: return "X86ISD::SHLD"; 11600 case X86ISD::SHRD: return "X86ISD::SHRD"; 11601 case X86ISD::FAND: return "X86ISD::FAND"; 11602 case X86ISD::FOR: return "X86ISD::FOR"; 11603 case X86ISD::FXOR: return "X86ISD::FXOR"; 11604 case X86ISD::FSRL: return "X86ISD::FSRL"; 11605 case X86ISD::FILD: return "X86ISD::FILD"; 11606 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 11607 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 11608 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 11609 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 11610 case X86ISD::FLD: return "X86ISD::FLD"; 11611 case X86ISD::FST: return "X86ISD::FST"; 11612 case X86ISD::CALL: return "X86ISD::CALL"; 11613 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 11614 case X86ISD::BT: return "X86ISD::BT"; 11615 case X86ISD::CMP: return "X86ISD::CMP"; 11616 case X86ISD::COMI: return "X86ISD::COMI"; 11617 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 11618 case X86ISD::SETCC: return "X86ISD::SETCC"; 11619 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 11620 case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd"; 11621 case X86ISD::FSETCCss: return "X86ISD::FSETCCss"; 11622 case X86ISD::CMOV: return "X86ISD::CMOV"; 11623 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 11624 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 11625 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 11626 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 11627 case X86ISD::GlobalBaseReg: 
return "X86ISD::GlobalBaseReg"; 11628 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 11629 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 11630 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 11631 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 11632 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 11633 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 11634 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 11635 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 11636 case X86ISD::ANDNP: return "X86ISD::ANDNP"; 11637 case X86ISD::PSIGN: return "X86ISD::PSIGN"; 11638 case X86ISD::BLENDV: return "X86ISD::BLENDV"; 11639 case X86ISD::BLENDPW: return "X86ISD::BLENDPW"; 11640 case X86ISD::BLENDPS: return "X86ISD::BLENDPS"; 11641 case X86ISD::BLENDPD: return "X86ISD::BLENDPD"; 11642 case X86ISD::HADD: return "X86ISD::HADD"; 11643 case X86ISD::HSUB: return "X86ISD::HSUB"; 11644 case X86ISD::FHADD: return "X86ISD::FHADD"; 11645 case X86ISD::FHSUB: return "X86ISD::FHSUB"; 11646 case X86ISD::FMAX: return "X86ISD::FMAX"; 11647 case X86ISD::FMIN: return "X86ISD::FMIN"; 11648 case X86ISD::FMAXC: return "X86ISD::FMAXC"; 11649 case X86ISD::FMINC: return "X86ISD::FMINC"; 11650 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 11651 case X86ISD::FRCP: return "X86ISD::FRCP"; 11652 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 11653 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; 11654 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 11655 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 11656 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 11657 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 11658 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; 11659 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 11660 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 11661 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 11662 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 11663 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 11664 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 11665 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 11666 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 11667 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 11668 case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL"; 11669 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 11670 case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; 11671 case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; 11672 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; 11673 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; 11674 case X86ISD::VSHL: return "X86ISD::VSHL"; 11675 case X86ISD::VSRL: return "X86ISD::VSRL"; 11676 case X86ISD::VSRA: return "X86ISD::VSRA"; 11677 case X86ISD::VSHLI: return "X86ISD::VSHLI"; 11678 case X86ISD::VSRLI: return "X86ISD::VSRLI"; 11679 case X86ISD::VSRAI: return "X86ISD::VSRAI"; 11680 case X86ISD::CMPP: return "X86ISD::CMPP"; 11681 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; 11682 case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; 11683 case X86ISD::ADD: return "X86ISD::ADD"; 11684 case X86ISD::SUB: return "X86ISD::SUB"; 11685 case X86ISD::ADC: return "X86ISD::ADC"; 11686 case X86ISD::SBB: return "X86ISD::SBB"; 11687 case X86ISD::SMUL: return "X86ISD::SMUL"; 11688 case X86ISD::UMUL: return "X86ISD::UMUL"; 11689 case X86ISD::INC: return "X86ISD::INC"; 11690 case X86ISD::DEC: return "X86ISD::DEC"; 11691 case X86ISD::OR: return "X86ISD::OR"; 11692 case X86ISD::XOR: return "X86ISD::XOR"; 11693 case X86ISD::AND: return "X86ISD::AND"; 11694 case 
X86ISD::ANDN: return "X86ISD::ANDN"; 11695 case X86ISD::BLSI: return "X86ISD::BLSI"; 11696 case X86ISD::BLSMSK: return "X86ISD::BLSMSK"; 11697 case X86ISD::BLSR: return "X86ISD::BLSR"; 11698 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 11699 case X86ISD::PTEST: return "X86ISD::PTEST"; 11700 case X86ISD::TESTP: return "X86ISD::TESTP"; 11701 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 11702 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 11703 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 11704 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 11705 case X86ISD::SHUFP: return "X86ISD::SHUFP"; 11706 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 11707 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 11708 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 11709 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 11710 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 11711 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 11712 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 11713 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 11714 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 11715 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 11716 case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; 11717 case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; 11718 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; 11719 case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; 11720 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; 11721 case X86ISD::VPERMV: return "X86ISD::VPERMV"; 11722 case X86ISD::VPERMI: return "X86ISD::VPERMI"; 11723 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; 11724 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 11725 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 11726 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 11727 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; 11728 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; 11729 case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; 11730 case X86ISD::SAHF: return "X86ISD::SAHF"; 11731 case X86ISD::RDRAND: return "X86ISD::RDRAND"; 11732 case X86ISD::FMADD: return "X86ISD::FMADD"; 11733 case X86ISD::FMSUB: return "X86ISD::FMSUB"; 11734 case X86ISD::FNMADD: return "X86ISD::FNMADD"; 11735 case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; 11736 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; 11737 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; 11738 } 11739} 11740 11741// isLegalAddressingMode - Return true if the addressing mode represented 11742// by AM is legal for this target, for a load/store of the specified type. 11743bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 11744 Type *Ty) const { 11745 // X86 supports extremely general addressing modes. 11746 CodeModel::Model M = getTargetMachine().getCodeModel(); 11747 Reloc::Model R = getTargetMachine().getRelocationModel(); 11748 11749 // X86 allows a sign-extended 32-bit immediate field as a displacement. 11750 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 11751 return false; 11752 11753 if (AM.BaseGV) { 11754 unsigned GVFlags = 11755 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 11756 11757 // If a reference to this global requires an extra load, we can't fold it. 11758 if (isGlobalStubReference(GVFlags)) 11759 return false; 11760 11761 // If BaseGV requires a register for the PIC base, we cannot also have a 11762 // BaseReg specified. 
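// On 32-bit PIC, for instance, a local global is typically addressed as
//   movl var@GOTOFF(%ebx), %eax
// with %ebx holding the PIC base, so the single base-register slot in the
// addressing mode is already spoken for.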
11763 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 11764 return false; 11765 11766 // If lower 4G is not available, then we must use rip-relative addressing. 11767 if ((M != CodeModel::Small || R != Reloc::Static) && 11768 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 11769 return false; 11770 } 11771 11772 switch (AM.Scale) { 11773 case 0: 11774 case 1: 11775 case 2: 11776 case 4: 11777 case 8: 11778 // These scales always work. 11779 break; 11780 case 3: 11781 case 5: 11782 case 9: 11783 // These scales are formed with basereg+scalereg. Only accept if there is 11784 // no basereg yet. 11785 if (AM.HasBaseReg) 11786 return false; 11787 break; 11788 default: // Other stuff never works. 11789 return false; 11790 } 11791 11792 return true; 11793} 11794 11795 11796bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 11797 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 11798 return false; 11799 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 11800 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 11801 if (NumBits1 <= NumBits2) 11802 return false; 11803 return true; 11804} 11805 11806bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { 11807 return Imm == (int32_t)Imm; 11808} 11809 11810bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { 11811 // Can also use sub to handle negated immediates. 11812 return Imm == (int32_t)Imm; 11813} 11814 11815bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 11816 if (!VT1.isInteger() || !VT2.isInteger()) 11817 return false; 11818 unsigned NumBits1 = VT1.getSizeInBits(); 11819 unsigned NumBits2 = VT2.getSizeInBits(); 11820 if (NumBits1 <= NumBits2) 11821 return false; 11822 return true; 11823} 11824 11825bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { 11826 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 11827 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 11828} 11829 11830bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 11831 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 11832 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 11833} 11834 11835bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 11836 // i16 instructions are longer (0x66 prefix) and potentially slower. 11837 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 11838} 11839 11840/// isShuffleMaskLegal - Targets can use this to indicate that they only 11841/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 11842/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 11843/// are assumed to be legal. 11844bool 11845X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 11846 EVT VT) const { 11847 // Very little shuffling can be done for 64-bit vectors right now. 11848 if (VT.getSizeInBits() == 64) 11849 return false; 11850 11851 // FIXME: pshufb, blends, shifts. 
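// e.g. the v4i32 mask <1,0,3,2> is accepted via isPSHUFDMask below (it is
// just "pshufd $0xb1"), whereas an arbitrary v16i8 byte permutation would
// need pshufb and is rejected here, which is what the FIXME above is about.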
11852 return (VT.getVectorNumElements() == 2 || 11853 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 11854 isMOVLMask(M, VT) || 11855 isSHUFPMask(M, VT, Subtarget->hasAVX()) || 11856 isPSHUFDMask(M, VT) || 11857 isPSHUFHWMask(M, VT, Subtarget->hasAVX2()) || 11858 isPSHUFLWMask(M, VT, Subtarget->hasAVX2()) || 11859 isPALIGNRMask(M, VT, Subtarget) || 11860 isUNPCKLMask(M, VT, Subtarget->hasAVX2()) || 11861 isUNPCKHMask(M, VT, Subtarget->hasAVX2()) || 11862 isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) || 11863 isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasAVX2())); 11864} 11865 11866bool 11867X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 11868 EVT VT) const { 11869 unsigned NumElts = VT.getVectorNumElements(); 11870 // FIXME: This collection of masks seems suspect. 11871 if (NumElts == 2) 11872 return true; 11873 if (NumElts == 4 && VT.is128BitVector()) { 11874 return (isMOVLMask(Mask, VT) || 11875 isCommutedMOVLMask(Mask, VT, true) || 11876 isSHUFPMask(Mask, VT, Subtarget->hasAVX()) || 11877 isSHUFPMask(Mask, VT, Subtarget->hasAVX(), /* Commuted */ true)); 11878 } 11879 return false; 11880} 11881 11882//===----------------------------------------------------------------------===// 11883// X86 Scheduler Hooks 11884//===----------------------------------------------------------------------===// 11885 11886// private utility function 11887 11888// Get CMPXCHG opcode for the specified data type. 11889static unsigned getCmpXChgOpcode(EVT VT) { 11890 switch (VT.getSimpleVT().SimpleTy) { 11891 case MVT::i8: return X86::LCMPXCHG8; 11892 case MVT::i16: return X86::LCMPXCHG16; 11893 case MVT::i32: return X86::LCMPXCHG32; 11894 case MVT::i64: return X86::LCMPXCHG64; 11895 default: 11896 break; 11897 } 11898 llvm_unreachable("Invalid operand size!"); 11899} 11900 11901// Get LOAD opcode for the specified data type. 11902static unsigned getLoadOpcode(EVT VT) { 11903 switch (VT.getSimpleVT().SimpleTy) { 11904 case MVT::i8: return X86::MOV8rm; 11905 case MVT::i16: return X86::MOV16rm; 11906 case MVT::i32: return X86::MOV32rm; 11907 case MVT::i64: return X86::MOV64rm; 11908 default: 11909 break; 11910 } 11911 llvm_unreachable("Invalid operand size!"); 11912} 11913 11914// Get opcode of the non-atomic one from the specified atomic instruction. 11915static unsigned getNonAtomicOpcode(unsigned Opc) { 11916 switch (Opc) { 11917 case X86::ATOMAND8: return X86::AND8rr; 11918 case X86::ATOMAND16: return X86::AND16rr; 11919 case X86::ATOMAND32: return X86::AND32rr; 11920 case X86::ATOMAND64: return X86::AND64rr; 11921 case X86::ATOMOR8: return X86::OR8rr; 11922 case X86::ATOMOR16: return X86::OR16rr; 11923 case X86::ATOMOR32: return X86::OR32rr; 11924 case X86::ATOMOR64: return X86::OR64rr; 11925 case X86::ATOMXOR8: return X86::XOR8rr; 11926 case X86::ATOMXOR16: return X86::XOR16rr; 11927 case X86::ATOMXOR32: return X86::XOR32rr; 11928 case X86::ATOMXOR64: return X86::XOR64rr; 11929 } 11930 llvm_unreachable("Unhandled atomic-load-op opcode!"); 11931} 11932 11933// Get opcode of the non-atomic one from the specified atomic instruction with 11934// extra opcode. 
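// For the min/max flavors the "extra" opcode is a compare and the returned
// opcode is a conditional move; e.g. ATOMMAX32 computes the new value
// roughly as
//   cmp(src, old)                   // flags from src - old
//   t1 = (src < old) ? old : src    // CMOVL picks the larger: a signed max
// The 8-bit forms deliberately return 32-bit CMOVs: there is no 8-bit cmov,
// so EmitAtomicLoadArith promotes the i8 operands to i32 first.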
11935static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc, 11936 unsigned &ExtraOpc) { 11937 switch (Opc) { 11938 case X86::ATOMNAND8: ExtraOpc = X86::NOT8r; return X86::AND8rr; 11939 case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr; 11940 case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr; 11941 case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr; 11942 case X86::ATOMMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVL32rr; 11943 case X86::ATOMMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr; 11944 case X86::ATOMMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr; 11945 case X86::ATOMMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr; 11946 case X86::ATOMMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVG32rr; 11947 case X86::ATOMMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr; 11948 case X86::ATOMMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr; 11949 case X86::ATOMMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr; 11950 case X86::ATOMUMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVB32rr; 11951 case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr; 11952 case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr; 11953 case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr; 11954 case X86::ATOMUMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVA32rr; 11955 case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr; 11956 case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr; 11957 case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr; 11958 } 11959 llvm_unreachable("Unhandled atomic-load-op opcode!"); 11960} 11961 11962// Get opcode of the non-atomic one from the specified atomic instruction for 11963// 64-bit data type on 32-bit target. 11964static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) { 11965 switch (Opc) { 11966 case X86::ATOMAND6432: HiOpc = X86::AND32rr; return X86::AND32rr; 11967 case X86::ATOMOR6432: HiOpc = X86::OR32rr; return X86::OR32rr; 11968 case X86::ATOMXOR6432: HiOpc = X86::XOR32rr; return X86::XOR32rr; 11969 case X86::ATOMADD6432: HiOpc = X86::ADC32rr; return X86::ADD32rr; 11970 case X86::ATOMSUB6432: HiOpc = X86::SBB32rr; return X86::SUB32rr; 11971 case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr; 11972 case X86::ATOMMAX6432: HiOpc = X86::SETLr; return X86::SETLr; 11973 case X86::ATOMMIN6432: HiOpc = X86::SETGr; return X86::SETGr; 11974 case X86::ATOMUMAX6432: HiOpc = X86::SETBr; return X86::SETBr; 11975 case X86::ATOMUMIN6432: HiOpc = X86::SETAr; return X86::SETAr; 11976 } 11977 llvm_unreachable("Unhandled atomic-load-op opcode!"); 11978} 11979 11980// Get opcode of the non-atomic one from the specified atomic instruction for 11981// 64-bit data type on 32-bit target with extra opcode. 11982static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc, 11983 unsigned &HiOpc, 11984 unsigned &ExtraOpc) { 11985 switch (Opc) { 11986 case X86::ATOMNAND6432: 11987 ExtraOpc = X86::NOT32r; 11988 HiOpc = X86::AND32rr; 11989 return X86::AND32rr; 11990 } 11991 llvm_unreachable("Unhandled atomic-load-op opcode!"); 11992} 11993 11994// Get pseudo CMOV opcode from the specified data type. 
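// The CMOV_GRxx pseudos cover targets without a real cmov instruction
// (pre-i686); EmitLoweredSelect later expands each pseudo into a
// compare-and-branch diamond that picks one of the two register operands,
// so the atomic min/max lowering below still works there.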
11995static unsigned getPseudoCMOVOpc(EVT VT) { 11996 switch (VT.getSimpleVT().SimpleTy) { 11997 case MVT::i8: return X86::CMOV_GR8; 11998 case MVT::i16: return X86::CMOV_GR16; 11999 case MVT::i32: return X86::CMOV_GR32; 12000 default: 12001 break; 12002 } 12003 llvm_unreachable("Unknown CMOV opcode!"); 12004} 12005 12006// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions. 12007// They will be translated into a spin-loop or compare-exchange loop from 12008// 12009// ... 12010// dst = atomic-fetch-op MI.addr, MI.val 12011// ... 12012// 12013// to 12014// 12015// ... 12016// EAX = LOAD MI.addr 12017// loop: 12018// t1 = OP MI.val, EAX 12019// LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] 12020// JNE loop 12021// sink: 12022// dst = EAX 12023// ... 12024MachineBasicBlock * 12025X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, 12026 MachineBasicBlock *MBB) const { 12027 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12028 DebugLoc DL = MI->getDebugLoc(); 12029 12030 MachineFunction *MF = MBB->getParent(); 12031 MachineRegisterInfo &MRI = MF->getRegInfo(); 12032 12033 const BasicBlock *BB = MBB->getBasicBlock(); 12034 MachineFunction::iterator I = MBB; 12035 ++I; 12036 12037 assert(MI->getNumOperands() <= X86::AddrNumOperands + 2 && 12038 "Unexpected number of operands"); 12039 12040 assert(MI->hasOneMemOperand() && 12041 "Expected atomic-load-op to have one memoperand"); 12042 12043 // Memory Reference 12044 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 12045 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 12046 12047 unsigned DstReg, SrcReg; 12048 unsigned MemOpndSlot; 12049 12050 unsigned CurOp = 0; 12051 12052 DstReg = MI->getOperand(CurOp++).getReg(); 12053 MemOpndSlot = CurOp; 12054 CurOp += X86::AddrNumOperands; 12055 SrcReg = MI->getOperand(CurOp++).getReg(); 12056 12057 const TargetRegisterClass *RC = MRI.getRegClass(DstReg); 12058 MVT::SimpleValueType VT = *RC->vt_begin(); 12059 unsigned AccPhyReg = getX86SubSuperRegister(X86::EAX, VT); 12060 12061 unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT); 12062 unsigned LOADOpc = getLoadOpcode(VT); 12063 12064 // For the atomic load-arith operator, we generate 12065 // 12066 // thisMBB: 12067 // EAX = LOAD [MI.addr] 12068 // mainMBB: 12069 // t1 = OP MI.val, EAX 12070 // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] 12071 // JNE mainMBB 12072 // sinkMBB: 12073 12074 MachineBasicBlock *thisMBB = MBB; 12075 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 12076 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 12077 MF->insert(I, mainMBB); 12078 MF->insert(I, sinkMBB); 12079 12080 MachineInstrBuilder MIB; 12081 12082 // Transfer the remainder of BB and its successor edges to sinkMBB. 12083 sinkMBB->splice(sinkMBB->begin(), MBB, 12084 llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); 12085 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 12086 12087 // thisMBB: 12088 MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), AccPhyReg); 12089 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) 12090 MIB.addOperand(MI->getOperand(MemOpndSlot + i)); 12091 MIB.setMemRefs(MMOBegin, MMOEnd); 12092 12093 thisMBB->addSuccessor(mainMBB); 12094 12095 // mainMBB: 12096 MachineBasicBlock *origMainMBB = mainMBB; 12097 mainMBB->addLiveIn(AccPhyReg); 12098 12099 // Copy AccPhyReg as it is used more than once. 
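// (LCMPXCHG implicitly reads and redefines EAX on every iteration, so the
// OP below must consume a stable copy.) For a concrete "atomicrmw and",
// the loop assembles to roughly:
//
//   movl (%rdi), %eax
// .Lloop:
//   movl %eax, %ecx
//   andl %esi, %ecx
//   lock cmpxchgl %ecx, (%rdi)   // on failure EAX reloads the current value
//   jne  .Lloop
//
// (register names are illustrative; the allocator picks the real ones).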
12100 unsigned AccReg = MRI.createVirtualRegister(RC); 12101 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccReg) 12102 .addReg(AccPhyReg); 12103 12104 unsigned t1 = MRI.createVirtualRegister(RC); 12105 unsigned Opc = MI->getOpcode(); 12106 switch (Opc) { 12107 default: 12108 llvm_unreachable("Unhandled atomic-load-op opcode!"); 12109 case X86::ATOMAND8: 12110 case X86::ATOMAND16: 12111 case X86::ATOMAND32: 12112 case X86::ATOMAND64: 12113 case X86::ATOMOR8: 12114 case X86::ATOMOR16: 12115 case X86::ATOMOR32: 12116 case X86::ATOMOR64: 12117 case X86::ATOMXOR8: 12118 case X86::ATOMXOR16: 12119 case X86::ATOMXOR32: 12120 case X86::ATOMXOR64: { 12121 unsigned ARITHOpc = getNonAtomicOpcode(Opc); 12122 BuildMI(mainMBB, DL, TII->get(ARITHOpc), t1).addReg(SrcReg) 12123 .addReg(AccReg); 12124 break; 12125 } 12126 case X86::ATOMNAND8: 12127 case X86::ATOMNAND16: 12128 case X86::ATOMNAND32: 12129 case X86::ATOMNAND64: { 12130 unsigned t2 = MRI.createVirtualRegister(RC); 12131 unsigned NOTOpc; 12132 unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc); 12133 BuildMI(mainMBB, DL, TII->get(ANDOpc), t2).addReg(SrcReg) 12134 .addReg(AccReg); 12135 BuildMI(mainMBB, DL, TII->get(NOTOpc), t1).addReg(t2); 12136 break; 12137 } 12138 case X86::ATOMMAX8: 12139 case X86::ATOMMAX16: 12140 case X86::ATOMMAX32: 12141 case X86::ATOMMAX64: 12142 case X86::ATOMMIN8: 12143 case X86::ATOMMIN16: 12144 case X86::ATOMMIN32: 12145 case X86::ATOMMIN64: 12146 case X86::ATOMUMAX8: 12147 case X86::ATOMUMAX16: 12148 case X86::ATOMUMAX32: 12149 case X86::ATOMUMAX64: 12150 case X86::ATOMUMIN8: 12151 case X86::ATOMUMIN16: 12152 case X86::ATOMUMIN32: 12153 case X86::ATOMUMIN64: { 12154 unsigned CMPOpc; 12155 unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc); 12156 12157 BuildMI(mainMBB, DL, TII->get(CMPOpc)) 12158 .addReg(SrcReg) 12159 .addReg(AccReg); 12160 12161 if (Subtarget->hasCMov()) { 12162 if (VT != MVT::i8) { 12163 // Native support 12164 BuildMI(mainMBB, DL, TII->get(CMOVOpc), t1) 12165 .addReg(SrcReg) 12166 .addReg(AccReg); 12167 } else { 12168 // Promote i8 to i32 to use CMOV32 12169 const TargetRegisterClass *RC32 = getRegClassFor(MVT::i32); 12170 unsigned SrcReg32 = MRI.createVirtualRegister(RC32); 12171 unsigned AccReg32 = MRI.createVirtualRegister(RC32); 12172 unsigned t2 = MRI.createVirtualRegister(RC32); 12173 12174 unsigned Undef = MRI.createVirtualRegister(RC32); 12175 BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef); 12176 12177 BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32) 12178 .addReg(Undef) 12179 .addReg(SrcReg) 12180 .addImm(X86::sub_8bit); 12181 BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32) 12182 .addReg(Undef) 12183 .addReg(AccReg) 12184 .addImm(X86::sub_8bit); 12185 12186 BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2) 12187 .addReg(SrcReg32) 12188 .addReg(AccReg32); 12189 12190 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t1) 12191 .addReg(t2, 0, X86::sub_8bit); 12192 } 12193 } else { 12194 // Use pseudo select and lower them. 
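      // The CMOV_GRxx pseudo carries the condition code as an immediate;
      // EmitLoweredSelect expands it into a branch diamond and may split
      // mainMBB, so emission continues in the block it returns.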
12195 assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) && 12196 "Invalid atomic-load-op transformation!"); 12197 unsigned SelOpc = getPseudoCMOVOpc(VT); 12198 X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc); 12199 assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!"); 12200 MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t1) 12201 .addReg(SrcReg).addReg(AccReg) 12202 .addImm(CC); 12203 mainMBB = EmitLoweredSelect(MIB, mainMBB); 12204 } 12205 break; 12206 } 12207 } 12208 12209 // Copy AccPhyReg back from virtual register. 12210 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccPhyReg) 12211 .addReg(AccReg); 12212 12213 MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); 12214 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) 12215 MIB.addOperand(MI->getOperand(MemOpndSlot + i)); 12216 MIB.addReg(t1); 12217 MIB.setMemRefs(MMOBegin, MMOEnd); 12218 12219 BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); 12220 12221 mainMBB->addSuccessor(origMainMBB); 12222 mainMBB->addSuccessor(sinkMBB); 12223 12224 // sinkMBB: 12225 sinkMBB->addLiveIn(AccPhyReg); 12226 12227 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 12228 TII->get(TargetOpcode::COPY), DstReg) 12229 .addReg(AccPhyReg); 12230 12231 MI->eraseFromParent(); 12232 return sinkMBB; 12233} 12234 12235// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic 12236// instructions. They will be translated into a spin-loop or compare-exchange 12237// loop from 12238// 12239// ... 12240// dst = atomic-fetch-op MI.addr, MI.val 12241// ... 12242// 12243// to 12244// 12245// ... 12246// EAX = LOAD [MI.addr + 0] 12247// EDX = LOAD [MI.addr + 4] 12248// loop: 12249// EBX = OP MI.val.lo, EAX 12250// ECX = OP MI.val.hi, EDX 12251// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] 12252// JNE loop 12253// sink: 12254// dst = EDX:EAX 12255// ... 
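// LCMPXCHG8B compares EDX:EAX with the 64-bit memory operand; if they are
// equal it stores ECX:EBX to memory and sets ZF, otherwise it clears ZF and
// loads the current memory value into EDX:EAX, so the JNE simply retries.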
12256MachineBasicBlock * 12257X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, 12258 MachineBasicBlock *MBB) const { 12259 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12260 DebugLoc DL = MI->getDebugLoc(); 12261 12262 MachineFunction *MF = MBB->getParent(); 12263 MachineRegisterInfo &MRI = MF->getRegInfo(); 12264 12265 const BasicBlock *BB = MBB->getBasicBlock(); 12266 MachineFunction::iterator I = MBB; 12267 ++I; 12268 12269 assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 && 12270 "Unexpected number of operands"); 12271 12272 assert(MI->hasOneMemOperand() && 12273 "Expected atomic-load-op32 to have one memoperand"); 12274 12275 // Memory Reference 12276 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 12277 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 12278 12279 unsigned DstLoReg, DstHiReg; 12280 unsigned SrcLoReg, SrcHiReg; 12281 unsigned MemOpndSlot; 12282 12283 unsigned CurOp = 0; 12284 12285 DstLoReg = MI->getOperand(CurOp++).getReg(); 12286 DstHiReg = MI->getOperand(CurOp++).getReg(); 12287 MemOpndSlot = CurOp; 12288 CurOp += X86::AddrNumOperands; 12289 SrcLoReg = MI->getOperand(CurOp++).getReg(); 12290 SrcHiReg = MI->getOperand(CurOp++).getReg(); 12291 12292 const TargetRegisterClass *RC = &X86::GR32RegClass; 12293 const TargetRegisterClass *RC8 = &X86::GR8RegClass; 12294 12295 unsigned LCMPXCHGOpc = X86::LCMPXCHG8B; 12296 unsigned LOADOpc = X86::MOV32rm; 12297 12298 // For the atomic load-arith operator, we generate 12299 // 12300 // thisMBB: 12301 // EAX = LOAD [MI.addr + 0] 12302 // EDX = LOAD [MI.addr + 4] 12303 // mainMBB: 12304 // EBX = OP MI.vallo, EAX 12305 // ECX = OP MI.valhi, EDX 12306 // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] 12307 // JNE mainMBB 12308 // sinkMBB: 12309 12310 MachineBasicBlock *thisMBB = MBB; 12311 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 12312 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 12313 MF->insert(I, mainMBB); 12314 MF->insert(I, sinkMBB); 12315 12316 MachineInstrBuilder MIB; 12317 12318 // Transfer the remainder of BB and its successor edges to sinkMBB. 12319 sinkMBB->splice(sinkMBB->begin(), MBB, 12320 llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); 12321 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 12322 12323 // thisMBB: 12324 // Lo 12325 MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EAX); 12326 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) 12327 MIB.addOperand(MI->getOperand(MemOpndSlot + i)); 12328 MIB.setMemRefs(MMOBegin, MMOEnd); 12329 // Hi 12330 MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EDX); 12331 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 12332 if (i == X86::AddrDisp) 12333 MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32) 12334 else 12335 MIB.addOperand(MI->getOperand(MemOpndSlot + i)); 12336 } 12337 MIB.setMemRefs(MMOBegin, MMOEnd); 12338 12339 thisMBB->addSuccessor(mainMBB); 12340 12341 // mainMBB: 12342 MachineBasicBlock *origMainMBB = mainMBB; 12343 mainMBB->addLiveIn(X86::EAX); 12344 mainMBB->addLiveIn(X86::EDX); 12345 12346 // Copy EDX:EAX as they are used more than once. 
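  // (They are the implicit comparand of LCMPXCHG8B as well as inputs to the
  //  non-atomic OP; e.g. ATOMADD6432 uses ADD32rr on the low half and
  //  ADC32rr on the high half, so the carry propagates between the halves.)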
12347 unsigned LoReg = MRI.createVirtualRegister(RC); 12348 unsigned HiReg = MRI.createVirtualRegister(RC); 12349 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), LoReg).addReg(X86::EAX); 12350 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), HiReg).addReg(X86::EDX); 12351 12352 unsigned t1L = MRI.createVirtualRegister(RC); 12353 unsigned t1H = MRI.createVirtualRegister(RC); 12354 12355 unsigned Opc = MI->getOpcode(); 12356 switch (Opc) { 12357 default: 12358 llvm_unreachable("Unhandled atomic-load-op6432 opcode!"); 12359 case X86::ATOMAND6432: 12360 case X86::ATOMOR6432: 12361 case X86::ATOMXOR6432: 12362 case X86::ATOMADD6432: 12363 case X86::ATOMSUB6432: { 12364 unsigned HiOpc; 12365 unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); 12366 BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg).addReg(LoReg); 12367 BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg).addReg(HiReg); 12368 break; 12369 } 12370 case X86::ATOMNAND6432: { 12371 unsigned HiOpc, NOTOpc; 12372 unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc); 12373 unsigned t2L = MRI.createVirtualRegister(RC); 12374 unsigned t2H = MRI.createVirtualRegister(RC); 12375 BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg).addReg(LoReg); 12376 BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg).addReg(HiReg); 12377 BuildMI(mainMBB, DL, TII->get(NOTOpc), t1L).addReg(t2L); 12378 BuildMI(mainMBB, DL, TII->get(NOTOpc), t1H).addReg(t2H); 12379 break; 12380 } 12381 case X86::ATOMMAX6432: 12382 case X86::ATOMMIN6432: 12383 case X86::ATOMUMAX6432: 12384 case X86::ATOMUMIN6432: { 12385 unsigned HiOpc; 12386 unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); 12387 unsigned cL = MRI.createVirtualRegister(RC8); 12388 unsigned cH = MRI.createVirtualRegister(RC8); 12389 unsigned cL32 = MRI.createVirtualRegister(RC); 12390 unsigned cH32 = MRI.createVirtualRegister(RC); 12391 unsigned cc = MRI.createVirtualRegister(RC); 12392 // cl := cmp src_lo, lo 12393 BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) 12394 .addReg(SrcLoReg).addReg(LoReg); 12395 BuildMI(mainMBB, DL, TII->get(LoOpc), cL); 12396 BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL); 12397 // ch := cmp src_hi, hi 12398 BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) 12399 .addReg(SrcHiReg).addReg(HiReg); 12400 BuildMI(mainMBB, DL, TII->get(HiOpc), cH); 12401 BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH); 12402 // cc := if (src_hi == hi) ? 
cl : ch; 12403 if (Subtarget->hasCMov()) { 12404 BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc) 12405 .addReg(cH32).addReg(cL32); 12406 } else { 12407 MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc) 12408 .addReg(cH32).addReg(cL32) 12409 .addImm(X86::COND_E); 12410 mainMBB = EmitLoweredSelect(MIB, mainMBB); 12411 } 12412 BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc); 12413 if (Subtarget->hasCMov()) { 12414 BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1L) 12415 .addReg(SrcLoReg).addReg(LoReg); 12416 BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1H) 12417 .addReg(SrcHiReg).addReg(HiReg); 12418 } else { 12419 MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1L) 12420 .addReg(SrcLoReg).addReg(LoReg) 12421 .addImm(X86::COND_NE); 12422 mainMBB = EmitLoweredSelect(MIB, mainMBB); 12423 MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1H) 12424 .addReg(SrcHiReg).addReg(HiReg) 12425 .addImm(X86::COND_NE); 12426 mainMBB = EmitLoweredSelect(MIB, mainMBB); 12427 } 12428 break; 12429 } 12430 case X86::ATOMSWAP6432: { 12431 unsigned HiOpc; 12432 unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); 12433 BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg); 12434 BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg); 12435 break; 12436 } 12437 } 12438 12439 // Copy EDX:EAX back from HiReg:LoReg 12440 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(LoReg); 12441 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(HiReg); 12442 // Copy ECX:EBX from t1H:t1L 12443 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t1L); 12444 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t1H); 12445 12446 MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); 12447 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) 12448 MIB.addOperand(MI->getOperand(MemOpndSlot + i)); 12449 MIB.setMemRefs(MMOBegin, MMOEnd); 12450 12451 BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); 12452 12453 mainMBB->addSuccessor(origMainMBB); 12454 mainMBB->addSuccessor(sinkMBB); 12455 12456 // sinkMBB: 12457 sinkMBB->addLiveIn(X86::EAX); 12458 sinkMBB->addLiveIn(X86::EDX); 12459 12460 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 12461 TII->get(TargetOpcode::COPY), DstLoReg) 12462 .addReg(X86::EAX); 12463 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 12464 TII->get(TargetOpcode::COPY), DstHiReg) 12465 .addReg(X86::EDX); 12466 12467 MI->eraseFromParent(); 12468 return sinkMBB; 12469} 12470 12471// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 12472// or XMM0_V32I8 in AVX all of this code can be replaced with that 12473// in the .td file. 12474MachineBasicBlock * 12475X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 12476 unsigned numArgs, bool memArg) const { 12477 assert(Subtarget->hasSSE42() && 12478 "Target must have SSE4.2 or AVX features enabled"); 12479 12480 DebugLoc dl = MI->getDebugLoc(); 12481 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12482 unsigned Opc; 12483 if (!Subtarget->hasAVX()) { 12484 if (memArg) 12485 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 12486 else 12487 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 12488 } else { 12489 if (memArg) 12490 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 12491 else 12492 Opc = numArgs == 3 ? 
X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 12493 } 12494 12495 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 12496 for (unsigned i = 0; i < numArgs; ++i) { 12497 MachineOperand &Op = MI->getOperand(i+1); 12498 if (!(Op.isReg() && Op.isImplicit())) 12499 MIB.addOperand(Op); 12500 } 12501 BuildMI(*BB, MI, dl, 12502 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) 12503 .addReg(X86::XMM0); 12504 12505 MI->eraseFromParent(); 12506 return BB; 12507} 12508 12509MachineBasicBlock * 12510X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const { 12511 DebugLoc dl = MI->getDebugLoc(); 12512 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12513 12514 // Address into RAX/EAX, other two args into ECX, EDX. 12515 unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; 12516 unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 12517 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); 12518 for (int i = 0; i < X86::AddrNumOperands; ++i) 12519 MIB.addOperand(MI->getOperand(i)); 12520 12521 unsigned ValOps = X86::AddrNumOperands; 12522 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 12523 .addReg(MI->getOperand(ValOps).getReg()); 12524 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) 12525 .addReg(MI->getOperand(ValOps+1).getReg()); 12526 12527 // The instruction doesn't actually take any operands though. 12528 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); 12529 12530 MI->eraseFromParent(); // The pseudo is gone now. 12531 return BB; 12532} 12533 12534MachineBasicBlock * 12535X86TargetLowering::EmitVAARG64WithCustomInserter( 12536 MachineInstr *MI, 12537 MachineBasicBlock *MBB) const { 12538 // Emit va_arg instruction on X86-64. 12539 12540 // Operands to this pseudo-instruction: 12541 // 0 ) Output : destination address (reg) 12542 // 1-5) Input : va_list address (addr, i64mem) 12543 // 6 ) ArgSize : Size (in bytes) of vararg type 12544 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 12545 // 8 ) Align : Alignment of type 12546 // 9 ) EFLAGS (implicit-def) 12547 12548 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 12549 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); 12550 12551 unsigned DestReg = MI->getOperand(0).getReg(); 12552 MachineOperand &Base = MI->getOperand(1); 12553 MachineOperand &Scale = MI->getOperand(2); 12554 MachineOperand &Index = MI->getOperand(3); 12555 MachineOperand &Disp = MI->getOperand(4); 12556 MachineOperand &Segment = MI->getOperand(5); 12557 unsigned ArgSize = MI->getOperand(6).getImm(); 12558 unsigned ArgMode = MI->getOperand(7).getImm(); 12559 unsigned Align = MI->getOperand(8).getImm(); 12560 12561 // Memory Reference 12562 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 12563 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 12564 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 12565 12566 // Machine Information 12567 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12568 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 12569 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 12570 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 12571 DebugLoc DL = MI->getDebugLoc(); 12572 12573 // struct va_list { 12574 // i32 gp_offset 12575 // i32 fp_offset 12576 // i64 overflow_area (address) 12577 // i64 reg_save_area (address) 12578 // } 12579 // 
sizeof(va_list) = 24 12580 // alignment(va_list) = 8 12581 12582 unsigned TotalNumIntRegs = 6; 12583 unsigned TotalNumXMMRegs = 8; 12584 bool UseGPOffset = (ArgMode == 1); 12585 bool UseFPOffset = (ArgMode == 2); 12586 unsigned MaxOffset = TotalNumIntRegs * 8 + 12587 (UseFPOffset ? TotalNumXMMRegs * 16 : 0); 12588 12589 /* Align ArgSize to a multiple of 8 */ 12590 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 12591 bool NeedsAlign = (Align > 8); 12592 12593 MachineBasicBlock *thisMBB = MBB; 12594 MachineBasicBlock *overflowMBB; 12595 MachineBasicBlock *offsetMBB; 12596 MachineBasicBlock *endMBB; 12597 12598 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 12599 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 12600 unsigned OffsetReg = 0; 12601 12602 if (!UseGPOffset && !UseFPOffset) { 12603 // If we only pull from the overflow region, we don't create a branch. 12604 // We don't need to alter control flow. 12605 OffsetDestReg = 0; // unused 12606 OverflowDestReg = DestReg; 12607 12608 offsetMBB = NULL; 12609 overflowMBB = thisMBB; 12610 endMBB = thisMBB; 12611 } else { 12612 // First emit code to check if gp_offset (or fp_offset) is below the bound. 12613 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 12614 // If not, pull from overflow_area. (branch to overflowMBB) 12615 // 12616 // thisMBB 12617 // | . 12618 // | . 12619 // offsetMBB overflowMBB 12620 // | . 12621 // | . 12622 // endMBB 12623 12624 // Registers for the PHI in endMBB 12625 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 12626 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 12627 12628 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 12629 MachineFunction *MF = MBB->getParent(); 12630 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12631 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12632 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12633 12634 MachineFunction::iterator MBBIter = MBB; 12635 ++MBBIter; 12636 12637 // Insert the new basic blocks 12638 MF->insert(MBBIter, offsetMBB); 12639 MF->insert(MBBIter, overflowMBB); 12640 MF->insert(MBBIter, endMBB); 12641 12642 // Transfer the remainder of MBB and its successor edges to endMBB. 12643 endMBB->splice(endMBB->begin(), thisMBB, 12644 llvm::next(MachineBasicBlock::iterator(MI)), 12645 thisMBB->end()); 12646 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 12647 12648 // Make offsetMBB and overflowMBB successors of thisMBB 12649 thisMBB->addSuccessor(offsetMBB); 12650 thisMBB->addSuccessor(overflowMBB); 12651 12652 // endMBB is a successor of both offsetMBB and overflowMBB 12653 offsetMBB->addSuccessor(endMBB); 12654 overflowMBB->addSuccessor(endMBB); 12655 12656 // Load the offset value into a register 12657 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 12658 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 12659 .addOperand(Base) 12660 .addOperand(Scale) 12661 .addOperand(Index) 12662 .addDisp(Disp, UseFPOffset ? 4 : 0) 12663 .addOperand(Segment) 12664 .setMemRefs(MMOBegin, MMOEnd); 12665 12666 // Check if there is enough room left to pull this argument. 12667 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 12668 .addReg(OffsetReg) 12669 .addImm(MaxOffset + 8 - ArgSizeA8); 12670 12671 // Branch to "overflowMBB" if offset >= max 12672 // Fall through to "offsetMBB" otherwise 12673 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 12674 .addMBB(overflowMBB); 12675 } 12676 12677 // In offsetMBB, emit code to use the reg_save_area. 
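  // Schematically:
  //   result     = reg_save_area + zext(offset)
  //   new offset = offset + (UseFPOffset ? 16 : 8)
  // with the updated offset stored back into the va_list.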
12678 if (offsetMBB) { 12679 assert(OffsetReg != 0); 12680 12681 // Read the reg_save_area address. 12682 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 12683 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 12684 .addOperand(Base) 12685 .addOperand(Scale) 12686 .addOperand(Index) 12687 .addDisp(Disp, 16) 12688 .addOperand(Segment) 12689 .setMemRefs(MMOBegin, MMOEnd); 12690 12691 // Zero-extend the offset 12692 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 12693 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 12694 .addImm(0) 12695 .addReg(OffsetReg) 12696 .addImm(X86::sub_32bit); 12697 12698 // Add the offset to the reg_save_area to get the final address. 12699 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 12700 .addReg(OffsetReg64) 12701 .addReg(RegSaveReg); 12702 12703 // Compute the offset for the next argument 12704 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 12705 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 12706 .addReg(OffsetReg) 12707 .addImm(UseFPOffset ? 16 : 8); 12708 12709 // Store it back into the va_list. 12710 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 12711 .addOperand(Base) 12712 .addOperand(Scale) 12713 .addOperand(Index) 12714 .addDisp(Disp, UseFPOffset ? 4 : 0) 12715 .addOperand(Segment) 12716 .addReg(NextOffsetReg) 12717 .setMemRefs(MMOBegin, MMOEnd); 12718 12719 // Jump to endMBB 12720 BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) 12721 .addMBB(endMBB); 12722 } 12723 12724 // 12725 // Emit code to use overflow area 12726 // 12727 12728 // Load the overflow_area address into a register. 12729 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 12730 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 12731 .addOperand(Base) 12732 .addOperand(Scale) 12733 .addOperand(Index) 12734 .addDisp(Disp, 8) 12735 .addOperand(Segment) 12736 .setMemRefs(MMOBegin, MMOEnd); 12737 12738 // If we need to align it, do so. Otherwise, just copy the address 12739 // to OverflowDestReg. 12740 if (NeedsAlign) { 12741 // Align the overflow address 12742 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 12743 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 12744 12745 // aligned_addr = (addr + (align-1)) & ~(align-1) 12746 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 12747 .addReg(OverflowAddrReg) 12748 .addImm(Align-1); 12749 12750 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 12751 .addReg(TmpReg) 12752 .addImm(~(uint64_t)(Align-1)); 12753 } else { 12754 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 12755 .addReg(OverflowAddrReg); 12756 } 12757 12758 // Compute the next overflow address after this argument. 12759 // (the overflow address should be kept 8-byte aligned) 12760 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 12761 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 12762 .addReg(OverflowDestReg) 12763 .addImm(ArgSizeA8); 12764 12765 // Store the new overflow address. 12766 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 12767 .addOperand(Base) 12768 .addOperand(Scale) 12769 .addOperand(Index) 12770 .addDisp(Disp, 8) 12771 .addOperand(Segment) 12772 .addReg(NextAddrReg) 12773 .setMemRefs(MMOBegin, MMOEnd); 12774 12775 // If we branched, emit the PHI to the front of endMBB. 
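  // i.e. DestReg = phi [OffsetDestReg, offsetMBB], [OverflowDestReg, overflowMBB]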
12776 if (offsetMBB) { 12777 BuildMI(*endMBB, endMBB->begin(), DL, 12778 TII->get(X86::PHI), DestReg) 12779 .addReg(OffsetDestReg).addMBB(offsetMBB) 12780 .addReg(OverflowDestReg).addMBB(overflowMBB); 12781 } 12782 12783 // Erase the pseudo instruction 12784 MI->eraseFromParent(); 12785 12786 return endMBB; 12787} 12788 12789MachineBasicBlock * 12790X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 12791 MachineInstr *MI, 12792 MachineBasicBlock *MBB) const { 12793 // Emit code to save XMM registers to the stack. The ABI says that the 12794 // number of registers to save is given in %al, so it's theoretically 12795 // possible to do an indirect jump trick to avoid saving all of them, 12796 // however this code takes a simpler approach and just executes all 12797 // of the stores if %al is non-zero. It's less code, and it's probably 12798 // easier on the hardware branch predictor, and stores aren't all that 12799 // expensive anyway. 12800 12801 // Create the new basic blocks. One block contains all the XMM stores, 12802 // and one block is the final destination regardless of whether any 12803 // stores were performed. 12804 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 12805 MachineFunction *F = MBB->getParent(); 12806 MachineFunction::iterator MBBIter = MBB; 12807 ++MBBIter; 12808 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 12809 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 12810 F->insert(MBBIter, XMMSaveMBB); 12811 F->insert(MBBIter, EndMBB); 12812 12813 // Transfer the remainder of MBB and its successor edges to EndMBB. 12814 EndMBB->splice(EndMBB->begin(), MBB, 12815 llvm::next(MachineBasicBlock::iterator(MI)), 12816 MBB->end()); 12817 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 12818 12819 // The original block will now fall through to the XMM save block. 12820 MBB->addSuccessor(XMMSaveMBB); 12821 // The XMMSaveMBB will fall through to the end block. 12822 XMMSaveMBB->addSuccessor(EndMBB); 12823 12824 // Now add the instructions. 12825 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12826 DebugLoc DL = MI->getDebugLoc(); 12827 12828 unsigned CountReg = MI->getOperand(0).getReg(); 12829 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 12830 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 12831 12832 if (!Subtarget->isTargetWin64()) { 12833 // If %al is 0, branch around the XMM save block. 12834 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 12835 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 12836 MBB->addSuccessor(EndMBB); 12837 } 12838 12839 unsigned MOVOpc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; 12840 // In the XMM save block, save all the XMM argument registers. 12841 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 12842 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 12843 MachineMemOperand *MMO = 12844 F->getMachineMemOperand( 12845 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 12846 MachineMemOperand::MOStore, 12847 /*Size=*/16, /*Align=*/16); 12848 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) 12849 .addFrameIndex(RegSaveFrameIndex) 12850 .addImm(/*Scale=*/1) 12851 .addReg(/*IndexReg=*/0) 12852 .addImm(/*Disp=*/Offset) 12853 .addReg(/*Segment=*/0) 12854 .addReg(MI->getOperand(i).getReg()) 12855 .addMemOperand(MMO); 12856 } 12857 12858 MI->eraseFromParent(); // The pseudo instruction is gone now. 
12859 12860 return EndMBB; 12861} 12862 12863// The EFLAGS operand of SelectItr might be missing a kill marker 12864// because there were multiple uses of EFLAGS, and ISel didn't know 12865// which to mark. Figure out whether SelectItr should have had a 12866// kill marker, and set it if it should. Returns the correct kill 12867// marker value. 12868static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, 12869 MachineBasicBlock* BB, 12870 const TargetRegisterInfo* TRI) { 12871 // Scan forward through BB for a use/def of EFLAGS. 12872 MachineBasicBlock::iterator miI(llvm::next(SelectItr)); 12873 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 12874 const MachineInstr& mi = *miI; 12875 if (mi.readsRegister(X86::EFLAGS)) 12876 return false; 12877 if (mi.definesRegister(X86::EFLAGS)) 12878 break; // Should have kill-flag - update below. 12879 } 12880 12881 // If we hit the end of the block, check whether EFLAGS is live into a 12882 // successor. 12883 if (miI == BB->end()) { 12884 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), 12885 sEnd = BB->succ_end(); 12886 sItr != sEnd; ++sItr) { 12887 MachineBasicBlock* succ = *sItr; 12888 if (succ->isLiveIn(X86::EFLAGS)) 12889 return false; 12890 } 12891 } 12892 12893 // We found a def, or hit the end of the basic block and EFLAGS wasn't live 12894 // out. SelectMI should have a kill flag on EFLAGS. 12895 SelectItr->addRegisterKilled(X86::EFLAGS, TRI); 12896 return true; 12897} 12898 12899MachineBasicBlock * 12900X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 12901 MachineBasicBlock *BB) const { 12902 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12903 DebugLoc DL = MI->getDebugLoc(); 12904 12905 // To "insert" a SELECT_CC instruction, we actually have to insert the 12906 // diamond control-flow pattern. The incoming instruction knows the 12907 // destination vreg to set, the condition code register to branch on, the 12908 // true/false values to select between, and a branch opcode to use. 12909 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 12910 MachineFunction::iterator It = BB; 12911 ++It; 12912 12913 // thisMBB: 12914 // ... 12915 // TrueVal = ... 12916 // cmpTY ccX, r1, r2 12917 // bCC copy1MBB 12918 // fallthrough --> copy0MBB 12919 MachineBasicBlock *thisMBB = BB; 12920 MachineFunction *F = BB->getParent(); 12921 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 12922 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 12923 F->insert(It, copy0MBB); 12924 F->insert(It, sinkMBB); 12925 12926 // If the EFLAGS register isn't dead in the terminator, then claim that it's 12927 // live into the sink and copy blocks. 12928 const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); 12929 if (!MI->killsRegister(X86::EFLAGS) && 12930 !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { 12931 copy0MBB->addLiveIn(X86::EFLAGS); 12932 sinkMBB->addLiveIn(X86::EFLAGS); 12933 } 12934 12935 // Transfer the remainder of BB and its successor edges to sinkMBB. 12936 sinkMBB->splice(sinkMBB->begin(), BB, 12937 llvm::next(MachineBasicBlock::iterator(MI)), 12938 BB->end()); 12939 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 12940 12941 // Add the true and fallthrough blocks as its successors. 12942 BB->addSuccessor(copy0MBB); 12943 BB->addSuccessor(sinkMBB); 12944 12945 // Create the conditional branch instruction. 
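  // Operand 3 of the pseudo holds the condition code, so this emits
  // "jCC sinkMBB" with the fallthrough going to copy0MBB.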
12946 unsigned Opc = 12947 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 12948 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 12949 12950 // copy0MBB: 12951 // %FalseValue = ... 12952 // # fallthrough to sinkMBB 12953 copy0MBB->addSuccessor(sinkMBB); 12954 12955 // sinkMBB: 12956 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 12957 // ... 12958 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 12959 TII->get(X86::PHI), MI->getOperand(0).getReg()) 12960 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 12961 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 12962 12963 MI->eraseFromParent(); // The pseudo instruction is gone now. 12964 return sinkMBB; 12965} 12966 12967MachineBasicBlock * 12968X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, 12969 bool Is64Bit) const { 12970 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12971 DebugLoc DL = MI->getDebugLoc(); 12972 MachineFunction *MF = BB->getParent(); 12973 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 12974 12975 assert(getTargetMachine().Options.EnableSegmentedStacks); 12976 12977 unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; 12978 unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; 12979 12980 // BB: 12981 // ... [Till the alloca] 12982 // If stacklet is not large enough, jump to mallocMBB 12983 // 12984 // bumpMBB: 12985 // Allocate by subtracting from RSP 12986 // Jump to continueMBB 12987 // 12988 // mallocMBB: 12989 // Allocate by call to runtime 12990 // 12991 // continueMBB: 12992 // ... 12993 // [rest of original BB] 12994 // 12995 12996 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12997 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12998 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12999 13000 MachineRegisterInfo &MRI = MF->getRegInfo(); 13001 const TargetRegisterClass *AddrRegClass = 13002 getRegClassFor(Is64Bit ? MVT::i64:MVT::i32); 13003 13004 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), 13005 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), 13006 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), 13007 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), 13008 sizeVReg = MI->getOperand(1).getReg(), 13009 physSPReg = Is64Bit ? X86::RSP : X86::ESP; 13010 13011 MachineFunction::iterator MBBIter = BB; 13012 ++MBBIter; 13013 13014 MF->insert(MBBIter, bumpMBB); 13015 MF->insert(MBBIter, mallocMBB); 13016 MF->insert(MBBIter, continueMBB); 13017 13018 continueMBB->splice(continueMBB->begin(), BB, llvm::next 13019 (MachineBasicBlock::iterator(MI)), BB->end()); 13020 continueMBB->transferSuccessorsAndUpdatePHIs(BB); 13021 13022 // Add code to the main basic block to check if the stack limit has been hit, 13023 // and if so, jump to mallocMBB otherwise to bumpMBB. 13024 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); 13025 BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) 13026 .addReg(tmpSPVReg).addReg(sizeVReg); 13027 BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) 13028 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) 13029 .addReg(SPLimitVReg); 13030 BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); 13031 13032 // bumpMBB simply decreases the stack pointer, since we know the current 13033 // stacklet has enough space. 
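  // SPLimitVReg already holds "old SP - size" from the check above, so it
  // serves as both the new stack pointer and the address of the allocation.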
13034  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
13035    .addReg(SPLimitVReg);
13036  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
13037    .addReg(SPLimitVReg);
13038  BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
13039
13040  // Calls into a routine in libgcc to allocate more space from the heap.
13041  const uint32_t *RegMask =
13042    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
13043  if (Is64Bit) {
13044    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
13045      .addReg(sizeVReg);
13046    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
13047      .addExternalSymbol("__morestack_allocate_stack_space")
13048      .addRegMask(RegMask)
13049      .addReg(X86::RDI, RegState::Implicit)
13050      .addReg(X86::RAX, RegState::ImplicitDefine);
13051  } else {
13052    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
13053      .addImm(12);
13054    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
13055    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
13056      .addExternalSymbol("__morestack_allocate_stack_space")
13057      .addRegMask(RegMask)
13058      .addReg(X86::EAX, RegState::ImplicitDefine);
13059  }
13060
13061  if (!Is64Bit)
13062    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
13063      .addImm(16);
13064
13065  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
13066    .addReg(Is64Bit ? X86::RAX : X86::EAX);
13067  BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
13068
13069  // Set up the CFG correctly.
13070  BB->addSuccessor(bumpMBB);
13071  BB->addSuccessor(mallocMBB);
13072  mallocMBB->addSuccessor(continueMBB);
13073  bumpMBB->addSuccessor(continueMBB);
13074
13075  // Take care of the PHI nodes.
13076  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
13077          MI->getOperand(0).getReg())
13078    .addReg(mallocPtrVReg).addMBB(mallocMBB)
13079    .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
13080
13081  // Delete the original pseudo instruction.
13082  MI->eraseFromParent();
13083
13084  // And we're done.
13085  return continueMBB;
13086}
13087
13088MachineBasicBlock *
13089X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
13090                                        MachineBasicBlock *BB) const {
13091  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13092  DebugLoc DL = MI->getDebugLoc();
13093
13094  assert(!Subtarget->isTargetEnvMacho());
13095
13096  // The lowering is pretty easy: we're just emitting the call to _alloca. The
13097  // non-trivial part is the implicit def of ESP.
13098
13099  if (Subtarget->isTargetWin64()) {
13100    if (Subtarget->isTargetCygMing()) {
13101      // ___chkstk(Mingw64):
13102      // Clobbers R10, R11, RAX and EFLAGS.
13103      // Updates RSP.
13104      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
13105        .addExternalSymbol("___chkstk")
13106        .addReg(X86::RAX, RegState::Implicit)
13107        .addReg(X86::RSP, RegState::Implicit)
13108        .addReg(X86::RAX, RegState::Define | RegState::Implicit)
13109        .addReg(X86::RSP, RegState::Define | RegState::Implicit)
13110        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
13111    } else {
13112      // __chkstk(MSVCRT): does not update stack pointer.
13113      // Clobbers R10, R11 and EFLAGS.
13114      // FIXME: RAX(allocated size) might be reused and not killed.
13115      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
13116        .addExternalSymbol("__chkstk")
13117        .addReg(X86::RAX, RegState::Implicit)
13118        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
13119      // RAX has the offset to be subtracted from RSP.
13120 BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) 13121 .addReg(X86::RSP) 13122 .addReg(X86::RAX); 13123 } 13124 } else { 13125 const char *StackProbeSymbol = 13126 Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; 13127 13128 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 13129 .addExternalSymbol(StackProbeSymbol) 13130 .addReg(X86::EAX, RegState::Implicit) 13131 .addReg(X86::ESP, RegState::Implicit) 13132 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 13133 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 13134 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 13135 } 13136 13137 MI->eraseFromParent(); // The pseudo instruction is gone now. 13138 return BB; 13139} 13140 13141MachineBasicBlock * 13142X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 13143 MachineBasicBlock *BB) const { 13144 // This is pretty easy. We're taking the value that we received from 13145 // our load from the relocation, sticking it in either RDI (x86-64) 13146 // or EAX and doing an indirect call. The return value will then 13147 // be in the normal return register. 13148 const X86InstrInfo *TII 13149 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 13150 DebugLoc DL = MI->getDebugLoc(); 13151 MachineFunction *F = BB->getParent(); 13152 13153 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 13154 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 13155 13156 // Get a register mask for the lowered call. 13157 // FIXME: The 32-bit calls have non-standard calling conventions. Use a 13158 // proper register mask. 13159 const uint32_t *RegMask = 13160 getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); 13161 if (Subtarget->is64Bit()) { 13162 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 13163 TII->get(X86::MOV64rm), X86::RDI) 13164 .addReg(X86::RIP) 13165 .addImm(0).addReg(0) 13166 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 13167 MI->getOperand(3).getTargetFlags()) 13168 .addReg(0); 13169 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 13170 addDirectMem(MIB, X86::RDI); 13171 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); 13172 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 13173 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 13174 TII->get(X86::MOV32rm), X86::EAX) 13175 .addReg(0) 13176 .addImm(0).addReg(0) 13177 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 13178 MI->getOperand(3).getTargetFlags()) 13179 .addReg(0); 13180 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 13181 addDirectMem(MIB, X86::EAX); 13182 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 13183 } else { 13184 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 13185 TII->get(X86::MOV32rm), X86::EAX) 13186 .addReg(TII->getGlobalBaseReg(F)) 13187 .addImm(0).addReg(0) 13188 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 13189 MI->getOperand(3).getTargetFlags()) 13190 .addReg(0); 13191 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 13192 addDirectMem(MIB, X86::EAX); 13193 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 13194 } 13195 13196 MI->eraseFromParent(); // The pseudo instruction is gone now. 
13197 return BB; 13198} 13199 13200MachineBasicBlock * 13201X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 13202 MachineBasicBlock *BB) const { 13203 switch (MI->getOpcode()) { 13204 default: llvm_unreachable("Unexpected instr type to insert"); 13205 case X86::TAILJMPd64: 13206 case X86::TAILJMPr64: 13207 case X86::TAILJMPm64: 13208 llvm_unreachable("TAILJMP64 would not be touched here."); 13209 case X86::TCRETURNdi64: 13210 case X86::TCRETURNri64: 13211 case X86::TCRETURNmi64: 13212 return BB; 13213 case X86::WIN_ALLOCA: 13214 return EmitLoweredWinAlloca(MI, BB); 13215 case X86::SEG_ALLOCA_32: 13216 return EmitLoweredSegAlloca(MI, BB, false); 13217 case X86::SEG_ALLOCA_64: 13218 return EmitLoweredSegAlloca(MI, BB, true); 13219 case X86::TLSCall_32: 13220 case X86::TLSCall_64: 13221 return EmitLoweredTLSCall(MI, BB); 13222 case X86::CMOV_GR8: 13223 case X86::CMOV_FR32: 13224 case X86::CMOV_FR64: 13225 case X86::CMOV_V4F32: 13226 case X86::CMOV_V2F64: 13227 case X86::CMOV_V2I64: 13228 case X86::CMOV_V8F32: 13229 case X86::CMOV_V4F64: 13230 case X86::CMOV_V4I64: 13231 case X86::CMOV_GR16: 13232 case X86::CMOV_GR32: 13233 case X86::CMOV_RFP32: 13234 case X86::CMOV_RFP64: 13235 case X86::CMOV_RFP80: 13236 return EmitLoweredSelect(MI, BB); 13237 13238 case X86::FP32_TO_INT16_IN_MEM: 13239 case X86::FP32_TO_INT32_IN_MEM: 13240 case X86::FP32_TO_INT64_IN_MEM: 13241 case X86::FP64_TO_INT16_IN_MEM: 13242 case X86::FP64_TO_INT32_IN_MEM: 13243 case X86::FP64_TO_INT64_IN_MEM: 13244 case X86::FP80_TO_INT16_IN_MEM: 13245 case X86::FP80_TO_INT32_IN_MEM: 13246 case X86::FP80_TO_INT64_IN_MEM: { 13247 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 13248 DebugLoc DL = MI->getDebugLoc(); 13249 13250 // Change the floating point control register to use "round towards zero" 13251 // mode when truncating to an integer value. 13252 MachineFunction *F = BB->getParent(); 13253 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 13254 addFrameReference(BuildMI(*BB, MI, DL, 13255 TII->get(X86::FNSTCW16m)), CWFrameIdx); 13256 13257 // Load the old value of the high byte of the control word... 13258 unsigned OldCW = 13259 F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); 13260 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 13261 CWFrameIdx); 13262 13263 // Set the high part to be round to zero... 13264 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 13265 .addImm(0xC7F); 13266 13267 // Reload the modified control word now... 13268 addFrameReference(BuildMI(*BB, MI, DL, 13269 TII->get(X86::FLDCW16m)), CWFrameIdx); 13270 13271 // Restore the memory image of control word to original value 13272 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 13273 .addReg(OldCW); 13274 13275 // Get the X86 opcode to use. 
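    // Each FPxx_TO_INTyy_IN_MEM pseudo maps onto the matching x87
    // store-integer opcode, e.g. FP32_TO_INT16_IN_MEM becomes IST_Fp16m32.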
13276 unsigned Opc; 13277 switch (MI->getOpcode()) { 13278 default: llvm_unreachable("illegal opcode!"); 13279 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 13280 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 13281 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 13282 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 13283 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 13284 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 13285 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 13286 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 13287 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 13288 } 13289 13290 X86AddressMode AM; 13291 MachineOperand &Op = MI->getOperand(0); 13292 if (Op.isReg()) { 13293 AM.BaseType = X86AddressMode::RegBase; 13294 AM.Base.Reg = Op.getReg(); 13295 } else { 13296 AM.BaseType = X86AddressMode::FrameIndexBase; 13297 AM.Base.FrameIndex = Op.getIndex(); 13298 } 13299 Op = MI->getOperand(1); 13300 if (Op.isImm()) 13301 AM.Scale = Op.getImm(); 13302 Op = MI->getOperand(2); 13303 if (Op.isImm()) 13304 AM.IndexReg = Op.getImm(); 13305 Op = MI->getOperand(3); 13306 if (Op.isGlobal()) { 13307 AM.GV = Op.getGlobal(); 13308 } else { 13309 AM.Disp = Op.getImm(); 13310 } 13311 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 13312 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 13313 13314 // Reload the original control word now. 13315 addFrameReference(BuildMI(*BB, MI, DL, 13316 TII->get(X86::FLDCW16m)), CWFrameIdx); 13317 13318 MI->eraseFromParent(); // The pseudo instruction is gone now. 13319 return BB; 13320 } 13321 // String/text processing lowering. 13322 case X86::PCMPISTRM128REG: 13323 case X86::VPCMPISTRM128REG: 13324 case X86::PCMPISTRM128MEM: 13325 case X86::VPCMPISTRM128MEM: 13326 case X86::PCMPESTRM128REG: 13327 case X86::VPCMPESTRM128REG: 13328 case X86::PCMPESTRM128MEM: 13329 case X86::VPCMPESTRM128MEM: { 13330 unsigned NumArgs; 13331 bool MemArg; 13332 switch (MI->getOpcode()) { 13333 default: llvm_unreachable("illegal opcode!"); 13334 case X86::PCMPISTRM128REG: 13335 case X86::VPCMPISTRM128REG: 13336 NumArgs = 3; MemArg = false; break; 13337 case X86::PCMPISTRM128MEM: 13338 case X86::VPCMPISTRM128MEM: 13339 NumArgs = 3; MemArg = true; break; 13340 case X86::PCMPESTRM128REG: 13341 case X86::VPCMPESTRM128REG: 13342 NumArgs = 5; MemArg = false; break; 13343 case X86::PCMPESTRM128MEM: 13344 case X86::VPCMPESTRM128MEM: 13345 NumArgs = 5; MemArg = true; break; 13346 } 13347 return EmitPCMP(MI, BB, NumArgs, MemArg); 13348 } 13349 13350 // Thread synchronization. 13351 case X86::MONITOR: 13352 return EmitMonitor(MI, BB); 13353 13354 // Atomic Lowering. 
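  // All of the ATOM* pseudos below are expanded into the compare-exchange
  // loops built by EmitAtomicLoadArith (or EmitAtomicLoadArith6432 for
  // 64-bit operations on 32-bit targets).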
13355 case X86::ATOMAND8: 13356 case X86::ATOMAND16: 13357 case X86::ATOMAND32: 13358 case X86::ATOMAND64: 13359 // Fall through 13360 case X86::ATOMOR8: 13361 case X86::ATOMOR16: 13362 case X86::ATOMOR32: 13363 case X86::ATOMOR64: 13364 // Fall through 13365 case X86::ATOMXOR16: 13366 case X86::ATOMXOR8: 13367 case X86::ATOMXOR32: 13368 case X86::ATOMXOR64: 13369 // Fall through 13370 case X86::ATOMNAND8: 13371 case X86::ATOMNAND16: 13372 case X86::ATOMNAND32: 13373 case X86::ATOMNAND64: 13374 // Fall through 13375 case X86::ATOMMAX8: 13376 case X86::ATOMMAX16: 13377 case X86::ATOMMAX32: 13378 case X86::ATOMMAX64: 13379 // Fall through 13380 case X86::ATOMMIN8: 13381 case X86::ATOMMIN16: 13382 case X86::ATOMMIN32: 13383 case X86::ATOMMIN64: 13384 // Fall through 13385 case X86::ATOMUMAX8: 13386 case X86::ATOMUMAX16: 13387 case X86::ATOMUMAX32: 13388 case X86::ATOMUMAX64: 13389 // Fall through 13390 case X86::ATOMUMIN8: 13391 case X86::ATOMUMIN16: 13392 case X86::ATOMUMIN32: 13393 case X86::ATOMUMIN64: 13394 return EmitAtomicLoadArith(MI, BB); 13395 13396 // This group does 64-bit operations on a 32-bit host. 13397 case X86::ATOMAND6432: 13398 case X86::ATOMOR6432: 13399 case X86::ATOMXOR6432: 13400 case X86::ATOMNAND6432: 13401 case X86::ATOMADD6432: 13402 case X86::ATOMSUB6432: 13403 case X86::ATOMMAX6432: 13404 case X86::ATOMMIN6432: 13405 case X86::ATOMUMAX6432: 13406 case X86::ATOMUMIN6432: 13407 case X86::ATOMSWAP6432: 13408 return EmitAtomicLoadArith6432(MI, BB); 13409 13410 case X86::VASTART_SAVE_XMM_REGS: 13411 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 13412 13413 case X86::VAARG_64: 13414 return EmitVAARG64WithCustomInserter(MI, BB); 13415 } 13416} 13417 13418//===----------------------------------------------------------------------===// 13419// X86 Optimization Hooks 13420//===----------------------------------------------------------------------===// 13421 13422void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 13423 APInt &KnownZero, 13424 APInt &KnownOne, 13425 const SelectionDAG &DAG, 13426 unsigned Depth) const { 13427 unsigned BitWidth = KnownZero.getBitWidth(); 13428 unsigned Opc = Op.getOpcode(); 13429 assert((Opc >= ISD::BUILTIN_OP_END || 13430 Opc == ISD::INTRINSIC_WO_CHAIN || 13431 Opc == ISD::INTRINSIC_W_CHAIN || 13432 Opc == ISD::INTRINSIC_VOID) && 13433 "Should use MaskedValueIsZero if you don't know whether Op" 13434 " is a target node!"); 13435 13436 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. 13437 switch (Opc) { 13438 default: break; 13439 case X86ISD::ADD: 13440 case X86ISD::SUB: 13441 case X86ISD::ADC: 13442 case X86ISD::SBB: 13443 case X86ISD::SMUL: 13444 case X86ISD::UMUL: 13445 case X86ISD::INC: 13446 case X86ISD::DEC: 13447 case X86ISD::OR: 13448 case X86ISD::XOR: 13449 case X86ISD::AND: 13450 // These nodes' second result is a boolean. 
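    // A boolean result is 0 or 1, so every bit above the lowest is known to
    // be zero; the SETCC case below records exactly that.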
13451 if (Op.getResNo() == 0) 13452 break; 13453 // Fallthrough 13454 case X86ISD::SETCC: 13455 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 13456 break; 13457 case ISD::INTRINSIC_WO_CHAIN: { 13458 unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 13459 unsigned NumLoBits = 0; 13460 switch (IntId) { 13461 default: break; 13462 case Intrinsic::x86_sse_movmsk_ps: 13463 case Intrinsic::x86_avx_movmsk_ps_256: 13464 case Intrinsic::x86_sse2_movmsk_pd: 13465 case Intrinsic::x86_avx_movmsk_pd_256: 13466 case Intrinsic::x86_mmx_pmovmskb: 13467 case Intrinsic::x86_sse2_pmovmskb_128: 13468 case Intrinsic::x86_avx2_pmovmskb: { 13469 // High bits of movmskp{s|d}, pmovmskb are known zero. 13470 switch (IntId) { 13471 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 13472 case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; 13473 case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; 13474 case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; 13475 case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; 13476 case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; 13477 case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; 13478 case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; 13479 } 13480 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); 13481 break; 13482 } 13483 } 13484 break; 13485 } 13486 } 13487} 13488 13489unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 13490 unsigned Depth) const { 13491 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 13492 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 13493 return Op.getValueType().getScalarType().getSizeInBits(); 13494 13495 // Fallback case. 13496 return 1; 13497} 13498 13499/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 13500/// node is a GlobalAddress + offset. 
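/// For example, (X86ISD::Wrapper (GlobalAddress @g + 8)) yields GA == @g and
/// Offset == 8; anything else defers to the generic implementation.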
13501bool X86TargetLowering::isGAPlusOffset(SDNode *N, 13502 const GlobalValue* &GA, 13503 int64_t &Offset) const { 13504 if (N->getOpcode() == X86ISD::Wrapper) { 13505 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 13506 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 13507 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 13508 return true; 13509 } 13510 } 13511 return TargetLowering::isGAPlusOffset(N, GA, Offset); 13512} 13513 13514/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the 13515/// same as extracting the high 128-bit part of 256-bit vector and then 13516/// inserting the result into the low part of a new 256-bit vector 13517static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { 13518 EVT VT = SVOp->getValueType(0); 13519 unsigned NumElems = VT.getVectorNumElements(); 13520 13521 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 13522 for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j) 13523 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 13524 SVOp->getMaskElt(j) >= 0) 13525 return false; 13526 13527 return true; 13528} 13529 13530/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the 13531/// same as extracting the low 128-bit part of 256-bit vector and then 13532/// inserting the result into the high part of a new 256-bit vector 13533static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { 13534 EVT VT = SVOp->getValueType(0); 13535 unsigned NumElems = VT.getVectorNumElements(); 13536 13537 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 13538 for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j) 13539 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 13540 SVOp->getMaskElt(j) >= 0) 13541 return false; 13542 13543 return true; 13544} 13545 13546/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. 13547static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, 13548 TargetLowering::DAGCombinerInfo &DCI, 13549 const X86Subtarget* Subtarget) { 13550 DebugLoc dl = N->getDebugLoc(); 13551 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 13552 SDValue V1 = SVOp->getOperand(0); 13553 SDValue V2 = SVOp->getOperand(1); 13554 EVT VT = SVOp->getValueType(0); 13555 unsigned NumElems = VT.getVectorNumElements(); 13556 13557 if (V1.getOpcode() == ISD::CONCAT_VECTORS && 13558 V2.getOpcode() == ISD::CONCAT_VECTORS) { 13559 // 13560 // 0,0,0,... 13561 // | 13562 // V UNDEF BUILD_VECTOR UNDEF 13563 // \ / \ / 13564 // CONCAT_VECTOR CONCAT_VECTOR 13565 // \ / 13566 // \ / 13567 // RESULT: V + zero extended 13568 // 13569 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || 13570 V2.getOperand(1).getOpcode() != ISD::UNDEF || 13571 V1.getOperand(1).getOpcode() != ISD::UNDEF) 13572 return SDValue(); 13573 13574 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) 13575 return SDValue(); 13576 13577 // To match the shuffle mask, the first half of the mask should 13578 // be exactly the first vector, and all the rest a splat with the 13579 // first element of the second one. 13580 for (unsigned i = 0; i != NumElems/2; ++i) 13581 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || 13582 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) 13583 return SDValue(); 13584 13585 // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. 
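    // In effect, the VZEXT_LOAD reads the original 128 bits from memory and
    // zero-fills the upper half of the 256-bit result, which is exactly the
    // <V, 0> pattern matched above.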
13586    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
13587      if (Ld->hasNUsesOfValue(1, 0)) {
13588        SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
13589        SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
13590        SDValue ResNode =
13591          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
13592                                  Ld->getMemoryVT(),
13593                                  Ld->getPointerInfo(),
13594                                  Ld->getAlignment(),
13595                                  false/*isVolatile*/, true/*ReadMem*/,
13596                                  false/*WriteMem*/);
13597        return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
13598      }
13599    }
13600
13601    // Emit a zeroed vector and insert the desired subvector into its
13602    // first half.
13603    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
13604    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
13605    return DCI.CombineTo(N, InsV);
13606  }
13607
13608  //===--------------------------------------------------------------------===//
13609  // Combine some shuffles into subvector extracts and inserts:
13610  //
13611
13612  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
13613  if (isShuffleHigh128VectorInsertLow(SVOp)) {
13614    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
13615    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
13616    return DCI.CombineTo(N, InsV);
13617  }
13618
13619  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
13620  if (isShuffleLow128VectorInsertHigh(SVOp)) {
13621    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
13622    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
13623    return DCI.CombineTo(N, InsV);
13624  }
13625
13626  return SDValue();
13627}
13628
13629/// PerformShuffleCombine - Performs several different shuffle combines.
13630static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
13631                                     TargetLowering::DAGCombinerInfo &DCI,
13632                                     const X86Subtarget *Subtarget) {
13633  DebugLoc dl = N->getDebugLoc();
13634  EVT VT = N->getValueType(0);
13635
13636  // Don't create instructions with illegal types after legalize types has run.
13637  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13638  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
13639    return SDValue();
13640
13641  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode.
13642  if (Subtarget->hasAVX() && VT.is256BitVector() &&
13643      N->getOpcode() == ISD::VECTOR_SHUFFLE)
13644    return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
13645
13646  // Only handle 128-bit wide vectors from here on.
13647  if (!VT.is128BitVector())
13648    return SDValue();
13649
13650  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
13651  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
13652  // consecutive, non-overlapping, and in the right order.
13653  SmallVector<SDValue, 16> Elts;
13654  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
13655    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
13656
13657  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
13658}
13659
13660
13661/// PerformTruncateCombine - Converts a truncate operation to
13662/// a sequence of vector shuffle operations.
/// This is possible when we truncate a 256-bit vector to a 128-bit vector.
static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget *Subtarget) {
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();

  if (!Subtarget->hasAVX())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue Op = N->getOperand(0);
  EVT OpVT = Op.getValueType();
  DebugLoc dl = N->getDebugLoc();

  if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {

    if (Subtarget->hasAVX2()) {
      // AVX2: v4i64 -> v4i32

      // VPERMD
      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};

      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op);
      Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32),
                                ShufMask);

      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op,
                         DAG.getIntPtrConstant(0));
    }

    // AVX: v4i64 -> v4i32
    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
                               DAG.getIntPtrConstant(0));

    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
                               DAG.getIntPtrConstant(2));

    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);

    // PSHUFD
    static const int ShufMask1[] = {0, 2, 0, 0};

    SDValue Undef = DAG.getUNDEF(VT);
    OpLo = DAG.getVectorShuffle(VT, dl, OpLo, Undef, ShufMask1);
    OpHi = DAG.getVectorShuffle(VT, dl, OpHi, Undef, ShufMask1);

    // MOVLHPS
    static const int ShufMask2[] = {0, 1, 4, 5};

    return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
  }

  if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {

    if (Subtarget->hasAVX2()) {
      // AVX2: v8i32 -> v8i16

      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op);

      // PSHUFB
      SmallVector<SDValue,32> pshufbMask;
      for (unsigned i = 0; i < 2; ++i) {
        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
        for (unsigned j = 0; j < 8; ++j)
          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
      }
      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8,
                               &pshufbMask[0], 32);
      Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV);

      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op);

      static const int ShufMask[] = {0, 2, -1, -1};
      Op = DAG.getVectorShuffle(MVT::v4i64, dl, Op, DAG.getUNDEF(MVT::v4i64),
                                &ShufMask[0]);

      Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
                       DAG.getIntPtrConstant(0));

      return DAG.getNode(ISD::BITCAST, dl, VT, Op);
    }

    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
                               DAG.getIntPtrConstant(0));

    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
                               DAG.getIntPtrConstant(4));

    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);

    // PSHUFB
    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
                                   -1, -1, -1, -1, -1, -1, -1, -1};

    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
    OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, Undef, ShufMask1);
    OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, Undef, ShufMask1);

    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);

    // MOVLHPS
    static const int ShufMask2[] = {0, 1, 4, 5};

    SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
  }

  return SDValue();
}

/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
/// specific shuffle of a load can be folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
/// shuffles have been custom lowered, so we need to handle those here.
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue InVec = N->getOperand(0);
  SDValue EltNo = N->getOperand(1);

  if (!isa<ConstantSDNode>(EltNo))
    return SDValue();

  EVT VT = InVec.getValueType();

  bool HasShuffleIntoBitcast = false;
  if (InVec.getOpcode() == ISD::BITCAST) {
    // Don't duplicate a load with other uses.
    if (!InVec.hasOneUse())
      return SDValue();
    EVT BCVT = InVec.getOperand(0).getValueType();
    if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
      return SDValue();
    InVec = InVec.getOperand(0);
    HasShuffleIntoBitcast = true;
  }

  if (!isTargetShuffle(InVec.getOpcode()))
    return SDValue();

  // Don't duplicate a load with other uses.
  if (!InVec.hasOneUse())
    return SDValue();

  SmallVector<int, 16> ShuffleMask;
  bool UnaryShuffle;
  if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
                            UnaryShuffle))
    return SDValue();

  // Select the input vector, guarding against an out-of-range extract index.
  unsigned NumElems = VT.getVectorNumElements();
  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
                                         : InVec.getOperand(1);

  // If the inputs to the shuffle are the same for both operands, then allow
  // two uses.
  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;

  if (LdNode.getOpcode() == ISD::BITCAST) {
    // Don't duplicate a load with other uses.
    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
      return SDValue();

    AllowedUses = 1; // Only allow one load use if we have a bitcast.
    LdNode = LdNode.getOperand(0);
  }

  if (!ISD::isNormalLoad(LdNode.getNode()))
    return SDValue();

  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);

  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
    return SDValue();

  if (HasShuffleIntoBitcast) {
    // If there's a bitcast before the shuffle, check if the load type and
    // alignment are valid.
    unsigned Align = LN0->getAlignment();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    unsigned NewAlign = TLI.getDataLayout()->
      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));

    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
      return SDValue();
  }

  // All checks match, so transform back to vector_shuffle so that the DAG
  // combiner can finish the job.
  DebugLoc dl = N->getDebugLoc();

  // Create a shuffle node, taking into account the case that it's a unary
  // shuffle.
  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
  Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
                                 InVec.getOperand(0), Shuffle,
                                 &ShuffleMask[0]);
  Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
                     EltNo);
}

/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
/// generation and convert it from being a bunch of shuffles and extracts
/// to a simple store and scalar loads to extract the elements.
static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI) {
  SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
  if (NewOp.getNode())
    return NewOp;

  SDValue InputVector = N->getOperand(0);

  // Only operate on vectors of 4 elements, where the alternative shuffling
  // gets to be more expensive.
  if (InputVector.getValueType() != MVT::v4i32)
    return SDValue();

  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
  // single use which is a sign-extend or zero-extend, and all elements are
  // used.
  SmallVector<SDNode *, 4> Uses;
  unsigned ExtractedElements = 0;
  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
    if (UI.getUse().getResNo() != InputVector.getResNo())
      return SDValue();

    SDNode *Extract = *UI;
    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    if (Extract->getValueType(0) != MVT::i32)
      return SDValue();
    if (!Extract->hasOneUse())
      return SDValue();
    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
      return SDValue();
    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
      return SDValue();

    // Record which element was extracted.
    ExtractedElements |=
      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();

    Uses.push_back(Extract);
  }

  // If not all the elements were used, this may not be worthwhile.
  if (ExtractedElements != 15)
    return SDValue();

  // Ok, we've now decided to do the transformation.
  DebugLoc dl = InputVector.getDebugLoc();

  // Store the value to a temporary stack slot.
  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
                            MachinePointerInfo(), false, false, 0);

  // Replace each use (extract) with a load of the appropriate element.
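  // (In effect: after the single vector store above, each
  //  (extract_vector_elt v, i) becomes a scalar load from
  //  stack_slot + i * elt_size.)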
  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
       UE = Uses.end(); UI != UE; ++UI) {
    SDNode *Extract = *UI;

    // Compute the element's address.
    SDValue Idx = Extract->getOperand(1);
    unsigned EltSize =
        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());

    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
                                     StackPtr, OffsetVal);

    // Load the scalar.
    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
                                     ScalarAddr, MachinePointerInfo(),
                                     false, false, false, 0);

    // Replace the extract with the load.
    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
  }

  // The replacement was made in place; don't return anything.
  return SDValue();
}

/// PerformSELECTCombine - Do target-specific dag combines on SELECT and
/// VSELECT nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget *Subtarget) {
  DebugLoc DL = N->getDebugLoc();
  SDValue Cond = N->getOperand(0);
  // Get the LHS/RHS of the select.
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  EVT VT = LHS.getValueType();

  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
  // instructions match the semantics of the common C idiom x<y?x:y but not
  // x<=y?x:y, because of how they handle negative zero (which can be
  // ignored in unsafe-math mode).
  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
      VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
      (Subtarget->hasSSE2() ||
       (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    unsigned Opcode = 0;
    // Check for x CC y ? x : y.
    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
      switch (CC) {
      default: break;
      case ISD::SETULT:
        // Converting this to a min would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!DAG.getTarget().Options.UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETOLE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETULE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // Fall through.
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETOGE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGT:
        // Converting this to a max would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!DAG.getTarget().Options.UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // Fall through.
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMAX;
        break;
      }
      // Check for x CC y ? y : x -- a min/max with reversed arms.
    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
      switch (CC) {
      default: break;
      case ISD::SETOGE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGT:
        // Converting this to a min would handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // Fall through.
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETULT:
        // Converting this to a max would handle NaNs incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETOLE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETULE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // Fall through.
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMAX;
        break;
      }
    }

    if (Opcode)
      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
  }

  // If this is a select between two integer constants, try to do some
  // optimizations.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
      // Don't do this for crazy integer types.
      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
        // so that TrueC (the true value) is larger than FalseC.
        bool NeedsCondInvert = false;

        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
            // Efficiently invertible.
            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
              isa<ConstantSDNode>(Cond.getOperand(1))))) {
          NeedsCondInvert = true;
          std::swap(TrueC, FalseC);
        }

        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
        if (FalseC->getAPIntValue() == 0 &&
            TrueC->getAPIntValue().isPowerOf2()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);

          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
                             DAG.getConstant(ShAmt, MVT::i8));
        }

        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                             FalseC->getValueType(0), Cond);
          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                             SDValue(FalseC, 0));
        }

        // Optimize cases that will turn into an LEA instruction.  This requires
        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

          bool isFastMultiplier = false;
          if (Diff < 10) {
            switch ((unsigned char)Diff) {
            default: break;
            case 1:  // result = add base, cond
            case 2:  // result = lea base(    , cond*2)
            case 3:  // result = lea base(cond, cond*2)
            case 4:  // result = lea base(    , cond*4)
            case 5:  // result = lea base(cond, cond*4)
            case 8:  // result = lea base(    , cond*8)
            case 9:  // result = lea base(cond, cond*8)
              isFastMultiplier = true;
              break;
            }
          }

          if (isFastMultiplier) {
            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
            if (NeedsCondInvert) // Invert the condition if needed.
              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(1, Cond.getValueType()));

            // Zero extend the condition if needed.
            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                               Cond);
            // Scale the condition by the difference.
            if (Diff != 1)
              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(Diff, Cond.getValueType()));

            // Add the base if non-zero.
            if (FalseC->getAPIntValue() != 0)
              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                                 SDValue(FalseC, 0));
            return Cond;
          }
        }
      }
  }

  // Canonicalize max and min:
  // (x > y) ? x : y -> (x >= y) ? x : y
  // (x < y) ? x : y -> (x <= y) ? x : y
  // This allows the use of COND_S / COND_NS (see TranslateX86CC), which
  // eliminates the need for an extra compare against zero. e.g.:
  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
  //   subl   %esi, %edi
  //   testl  %edi, %edi
  //   movl   $0, %eax
  //   cmovgl %edi, %eax
  // =>
  //   xorl   %eax, %eax
  //   subl   %esi, %edi
  //   cmovsl %eax, %edi
  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    switch (CC) {
    default: break;
    case ISD::SETLT:
    case ISD::SETGT: {
      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
      Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(),
                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
      return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
    }
    }
  }

  // If we know that this node is legal then we know that it is going to be
  // matched by one of the SSE/AVX BLEND instructions. These instructions only
  // depend on the highest bit in each word. Try to use SimplifyDemandedBits
  // to simplify previous instructions.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
      !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
    unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();

    // Don't optimize vector selects that map to mask registers.
    if (BitWidth == 1)
      return SDValue();

    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);

    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
                                          DCI.isBeforeLegalizeOps());
    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }

  return SDValue();
}

// Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and the proper
// condition code.
//
// Simplify the following patterns:
// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
// to (Op EFLAGS Cond)
//
// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
// to (Op EFLAGS !Cond)
//
// where Op could be BRCOND or CMOV.
//
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
  // Quit unless Cmp is a CMP, or a SUB whose value result is unused.
  if (Cmp.getOpcode() != X86ISD::CMP &&
      (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
    return SDValue();

  // Quit if not used as a boolean value.
  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  // Check CMP operands. One of them should be 0 or 1 and the other should be
  // a SetCC or extended from it.
  SDValue Op1 = Cmp.getOperand(0);
  SDValue Op2 = Cmp.getOperand(1);

  SDValue SetCC;
  const ConstantSDNode* C = 0;
  bool needOppositeCond = (CC == X86::COND_E);

  if ((C = dyn_cast<ConstantSDNode>(Op1)))
    SetCC = Op2;
  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
    SetCC = Op1;
  else // Quit if neither operand is a constant.
    return SDValue();

  if (C->getZExtValue() == 1)
    needOppositeCond = !needOppositeCond;
  else if (C->getZExtValue() != 0)
    // Quit if the constant is neither 0 nor 1.
    return SDValue();

  // Skip a 'zext' node.
  if (SetCC.getOpcode() == ISD::ZERO_EXTEND)
    SetCC = SetCC.getOperand(0);

  switch (SetCC.getOpcode()) {
  case X86ISD::SETCC:
    // Set the condition code, or the opposite one if necessary.
    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
    if (needOppositeCond)
      CC = X86::GetOppositeBranchCondition(CC);
    return SetCC.getOperand(1);
  case X86ISD::CMOV: {
    // Check whether the false/true values are the canonical ones, i.e. 0 or 1.
    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
    // Quit if the true value is not a constant.
    if (!TVal)
      return SDValue();
    // Quit if the false value is not a constant.
    if (!FVal) {
      // A special case for rdrand, where 0 is produced when the false
      // condition is found.
      SDValue Op = SetCC.getOperand(0);
      if (Op.getOpcode() != X86ISD::RDRAND)
        return SDValue();
    }
    // Quit if the false value is not the constant 0 or 1.
    bool FValIsFalse = true;
    if (FVal && FVal->getZExtValue() != 0) {
      if (FVal->getZExtValue() != 1)
        return SDValue();
      // If FVal is 1, the opposite condition is needed.
      needOppositeCond = !needOppositeCond;
      FValIsFalse = false;
    }
    // Quit if TVal is not the constant opposite of FVal.
    if (FValIsFalse && TVal->getZExtValue() != 1)
      return SDValue();
    if (!FValIsFalse && TVal->getZExtValue() != 0)
      return SDValue();
    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
    if (needOppositeCond)
      CC = X86::GetOppositeBranchCondition(CC);
    return SetCC.getOperand(3);
  }
  }

  return SDValue();
}

/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
  DebugLoc DL = N->getDebugLoc();

  // If the flag operand isn't dead, don't touch this CMOV.
  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
    return SDValue();

  SDValue FalseOp = N->getOperand(0);
  SDValue TrueOp = N->getOperand(1);
  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
  SDValue Cond = N->getOperand(3);

  if (CC == X86::COND_E || CC == X86::COND_NE) {
    switch (Cond.getOpcode()) {
    default: break;
    case X86ISD::BSR:
    case X86ISD::BSF:
      // If the operand of BSR / BSF is proven never zero, then ZF cannot be
      // set.
      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
        return (CC == X86::COND_E) ? FalseOp : TrueOp;
    }
  }

  SDValue Flags;

  Flags = checkBoolTestSetCCCombine(Cond, CC);
  if (Flags.getNode() &&
      // Extra check as FCMOV only supports a subset of X86 cond.
      (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
    SDValue Ops[] = { FalseOp, TrueOp,
                      DAG.getConstant(CC, MVT::i8), Flags };
    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
                       Ops, array_lengthof(Ops));
  }

  // If this is a select between two integer constants, try to do some
  // optimizations.  Note that the operands are ordered the opposite of SELECT
  // operands.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
      // larger than FalseC (the false value).
      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
        CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueC, FalseC);
        std::swap(TrueOp, FalseOp);
      }

      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
      // This is efficient for any integer data type (including i8/i16) and
      // shift amount.
      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                           DAG.getConstant(CC, MVT::i8), Cond);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
                           DAG.getConstant(ShAmt, MVT::i8));
        if (N->getNumValues() == 2)  // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
      // for any integer data type, including i8/i16.
      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                           DAG.getConstant(CC, MVT::i8), Cond);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                           FalseC->getValueType(0), Cond);
        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                           SDValue(FalseC, 0));

        if (N->getNumValues() == 2)  // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize cases that will turn into an LEA instruction.  This requires
      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
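      // (For instance, C ? 13 : 10 can become 10 + 3*zext(setcc(C)), i.e. a
      //  single LEA off the zero-extended condition.)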
      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

        bool isFastMultiplier = false;
        if (Diff < 10) {
          switch ((unsigned char)Diff) {
          default: break;
          case 1:  // result = add base, cond
          case 2:  // result = lea base(    , cond*2)
          case 3:  // result = lea base(cond, cond*2)
          case 4:  // result = lea base(    , cond*4)
          case 5:  // result = lea base(cond, cond*4)
          case 8:  // result = lea base(    , cond*8)
          case 9:  // result = lea base(cond, cond*8)
            isFastMultiplier = true;
            break;
          }
        }

        if (isFastMultiplier) {
          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                             DAG.getConstant(CC, MVT::i8), Cond);
          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                             Cond);
          // Scale the condition by the difference.
          if (Diff != 1)
            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(Diff, Cond.getValueType()));

          // Add the base if non-zero.
          if (FalseC->getAPIntValue() != 0)
            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                               SDValue(FalseC, 0));
          if (N->getNumValues() == 2)  // Dead flag value?
            return DCI.CombineTo(N, Cond, SDValue());
          return Cond;
        }
      }
    }
  }

  // Handle these cases:
  // (select (x != c), e, c) -> (select (x != c), e, x)
  // (select (x == c), c, e) -> (select (x == c), x, e)
  // where c is an integer constant, and the "select" is the combination
  // of CMOV and CMP.
  //
  // The rationale for this change is that the conditional-move from a constant
  // needs two instructions, whereas a conditional-move from a register needs
  // only one instruction.
  //
  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
  //         some instruction-combining opportunities. This optimization needs
  //         to be postponed as late as possible.
  //
  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
    // The DCI.xxxx conditions are provided to postpone the optimization as
    // late as possible.

    ConstantSDNode *CmpAgainst = 0;
    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
        dyn_cast<ConstantSDNode>(Cond.getOperand(0)) == 0) {

      if (CC == X86::COND_NE &&
          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
        CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueOp, FalseOp);
      }

      if (CC == X86::COND_E &&
          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
        SDValue Ops[] = { FalseOp, Cond.getOperand(0), N->getOperand(2), Cond };
        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops,
                           array_lengthof(Ops));
      }
    }
  }

  return SDValue();
}


/// PerformMulCombine - Optimize a single multiply with constant into two
/// operations in order to implement it with two cheaper instructions, e.g.
/// LEA + SHL, LEA + LEA.
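/// For example (illustrative): x*45 factors as (x*9)*5 and can be emitted as
///   leaq (%rdi,%rdi,8), %rax    # t = x*9
///   leaq (%rax,%rax,4), %rax    # t*5 = x*45
/// while x*40 becomes a shift plus one LEA: (x << 3) * 5.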
static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();
  uint64_t MulAmt = C->getZExtValue();
  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
    return SDValue();

  uint64_t MulAmt1 = 0;
  uint64_t MulAmt2 = 0;
  if ((MulAmt % 9) == 0) {
    MulAmt1 = 9;
    MulAmt2 = MulAmt / 9;
  } else if ((MulAmt % 5) == 0) {
    MulAmt1 = 5;
    MulAmt2 = MulAmt / 5;
  } else if ((MulAmt % 3) == 0) {
    MulAmt1 = 3;
    MulAmt2 = MulAmt / 3;
  }
  if (MulAmt2 &&
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
    DebugLoc DL = N->getDebugLoc();

    if (isPowerOf2_64(MulAmt2) &&
        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If the second multiplier is a power of two, issue it first. We want
      // the multiply by 3, 5, or 9 to be folded into the addressing mode
      // unless the lone use is an add.
      std::swap(MulAmt1, MulAmt2);

    SDValue NewMul;
    if (isPowerOf2_64(MulAmt1))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                           DAG.getConstant(MulAmt1, VT));

    if (isPowerOf2_64(MulAmt2))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                           DAG.getConstant(MulAmt2, VT));

    // Do not add new nodes to the DAG combiner worklist.
    DCI.CombineTo(N, NewMul, false);
  }
  return SDValue();
}

static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  EVT VT = N0.getValueType();

  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
  // since the result of setcc_c is all zeros or all ones.
  if (VT.isInteger() && !VT.isVector() &&
      N1C && N0.getOpcode() == ISD::AND &&
      N0.getOperand(1).getOpcode() == ISD::Constant) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
        ((N00.getOpcode() == ISD::ANY_EXTEND ||
          N00.getOpcode() == ISD::ZERO_EXTEND) &&
         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
      APInt ShAmt = N1C->getAPIntValue();
      Mask = Mask.shl(ShAmt);
      if (Mask != 0)
        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
                           N00, DAG.getConstant(Mask, VT));
    }
  }


  // Hardware support for vector shifts is sparse, which makes us scalarize
  // the vector operations in many cases. Also, on Sandy Bridge ADD is faster
  // than shl:
  // (shl V, 1) -> add V,V
  if (isSplatVector(N1.getNode())) {
    assert(N0.getValueType().isVector() && "Invalid vector shift type");
    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
    // We shift all of the values by one. In many cases we do not have
    // hardware support for this operation. This is better expressed as an ADD
    // of two values.
    if (N1C && (1 == N1C->getZExtValue())) {
      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
    }
  }

  return SDValue();
}

/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
/// when possible.
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (N->getOpcode() == ISD::SHL) {
    SDValue V = PerformSHLCombine(N, DAG);
    if (V.getNode()) return V;
  }

  // On X86 with SSE2 support, we can transform this to a vector shift if
  // all elements are shifted by the same amount. We can't do this in legalize
  // because a constant vector is typically transformed to a constant pool
  // load, so we have no knowledge of the shift amount.
  if (!Subtarget->hasSSE2())
    return SDValue();

  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
      (!Subtarget->hasAVX2() ||
       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
    return SDValue();

  SDValue ShAmtOp = N->getOperand(1);
  EVT EltVT = VT.getVectorElementType();
  DebugLoc DL = N->getDebugLoc();
  SDValue BaseShAmt = SDValue();
  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
    unsigned NumElts = VT.getVectorNumElements();
    unsigned i = 0;
    for (; i != NumElts; ++i) {
      SDValue Arg = ShAmtOp.getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      BaseShAmt = Arg;
      break;
    }
    // Handle the case where the build_vector is all undef.
    // FIXME: Should DAG allow this?
    if (i == NumElts)
      return SDValue();

    for (; i != NumElts; ++i) {
      SDValue Arg = ShAmtOp.getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      if (Arg != BaseShAmt) {
        return SDValue();
      }
    }
  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
    SDValue InVec = ShAmtOp.getOperand(0);
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      unsigned NumElts = InVec.getValueType().getVectorNumElements();
      unsigned i = 0;
      for (; i != NumElts; ++i) {
        SDValue Arg = InVec.getOperand(i);
        if (Arg.getOpcode() == ISD::UNDEF) continue;
        BaseShAmt = Arg;
        break;
      }
    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
        unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
        if (C->getZExtValue() == SplatIdx)
          BaseShAmt = InVec.getOperand(1);
      }
    }
    if (BaseShAmt.getNode() == 0) {
      // Don't create instructions with illegal types after legalize
      // types has run.
      if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) &&
          !DCI.isBeforeLegalize())
        return SDValue();

      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
                              DAG.getIntPtrConstant(0));
    }
  } else
    return SDValue();

  // The shift amount is an i32.
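  // (Sketch: e.g. (shl <4 x i32> %x, <5, 5, 5, 5>) has a uniform shift
  //  amount, so it can become a single X86ISD::VSHLI node with immediate 5.)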
  if (EltVT.bitsGT(MVT::i32))
    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
  else if (EltVT.bitsLT(MVT::i32))
    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);

  // The shift amount is identical, so we can do a vector shift.
  SDValue ValOp = N->getOperand(0);
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Unknown shift opcode!");
  case ISD::SHL:
    switch (VT.getSimpleVT().SimpleTy) {
    default: return SDValue();
    case MVT::v2i64:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v4i64:
    case MVT::v8i32:
    case MVT::v16i16:
      return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG);
    }
  case ISD::SRA:
    switch (VT.getSimpleVT().SimpleTy) {
    default: return SDValue();
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v8i32:
    case MVT::v16i16:
      return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG);
    }
  case ISD::SRL:
    switch (VT.getSimpleVT().SimpleTy) {
    default: return SDValue();
    case MVT::v2i64:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v4i64:
    case MVT::v8i32:
    case MVT::v16i16:
      return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG);
    }
  }
}


// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ...))
// pattern where both setccs reference the same FP CMP, and rewrite for
// CMPEQSS and friends.  Likewise for OR -> CMPNEQSS.
static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget *Subtarget) {
  unsigned opcode;

  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
  // we're requiring SSE2 for both.
  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue CMP0 = N0->getOperand(1);
    SDValue CMP1 = N1->getOperand(1);
    DebugLoc DL = N->getDebugLoc();

    // The SETCCs should both refer to the same CMP.
    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
      return SDValue();

    SDValue CMP00 = CMP0->getOperand(0);
    SDValue CMP01 = CMP0->getOperand(1);
    EVT VT = CMP00.getValueType();

    if (VT == MVT::f32 || VT == MVT::f64) {
      bool ExpectingFlags = false;
      // Check for any users that want flags:
      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
           !ExpectingFlags && UI != UE; ++UI)
        switch (UI->getOpcode()) {
        default:
        case ISD::BR_CC:
        case ISD::BRCOND:
        case ISD::SELECT:
          ExpectingFlags = true;
          break;
        case ISD::CopyToReg:
        case ISD::SIGN_EXTEND:
        case ISD::ZERO_EXTEND:
        case ISD::ANY_EXTEND:
          break;
        }

      if (!ExpectingFlags) {
        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);

        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
          X86::CondCode tmp = cc0;
          cc0 = cc1;
          cc1 = tmp;
        }

        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
          X86ISD::NodeType NTOperator = is64BitFP ?
            X86ISD::FSETCCsd : X86ISD::FSETCCss;
          // FIXME: need symbolic constants for these magic numbers.
          // See X86ATTInstPrinter.cpp:printSSECC().
          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
          SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00,
                                              CMP01,
                                              DAG.getConstant(x86cc, MVT::i8));
          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
                                              OnesOrZeroesF);
          SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
                                      DAG.getConstant(1, MVT::i32));
          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                              ANDed);
          return OneBitOfTruth;
        }
      }
    }
  }
  return SDValue();
}

/// CanFoldXORWithAllOnes - Test whether the XOR operand is an all-ones vector
/// so it can be folded inside ANDNP.
static bool CanFoldXORWithAllOnes(const SDNode *N) {
  EVT VT = N->getValueType(0);

  // Match direct all-ones for 128 and 256-bit vectors.
  if (ISD::isBuildVectorAllOnes(N))
    return true;

  // Look through a bit convert.
  if (N->getOpcode() == ISD::BITCAST)
    N = N->getOperand(0).getNode();

  // Sometimes the operand may come from an insert_subvector building a
  // 256-bit all-ones vector.
  if (VT.is256BitVector() &&
      N->getOpcode() == ISD::INSERT_SUBVECTOR) {
    SDValue V1 = N->getOperand(0);
    SDValue V2 = N->getOperand(1);

    if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
        ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
        ISD::isBuildVectorAllOnes(V2.getNode()))
      return true;
  }

  return false;
}

static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
  if (R.getNode())
    return R;

  EVT VT = N->getValueType(0);

  // Create ANDN, BLSI, and BLSR instructions:
  //   BLSI is X & (-X)
  //   BLSR is X & (X-1)
  if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    DebugLoc DL = N->getDebugLoc();

    // Check LHS for not.
    if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1)))
      return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1);
    // Check RHS for not.
    if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1)))
      return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0);

    // Check LHS for neg.
    if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
        isZero(N0.getOperand(0)))
      return DAG.getNode(X86ISD::BLSI, DL, VT, N1);

    // Check RHS for neg.
    if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
        isZero(N1.getOperand(0)))
      return DAG.getNode(X86ISD::BLSI, DL, VT, N0);

    // Check LHS for X-1.
    if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
        isAllOnes(N0.getOperand(1)))
      return DAG.getNode(X86ISD::BLSR, DL, VT, N1);

    // Check RHS for X-1.
    if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
        isAllOnes(N1.getOperand(1)))
      return DAG.getNode(X86ISD::BLSR, DL, VT, N0);

    return SDValue();
  }

  // Want to form ANDNP nodes:
  // 1) In the hopes of then easily combining them with OR and AND nodes
  //    to form PBLEND/PSIGN.
  // 2) To match ANDN packed intrinsics.
  if (VT != MVT::v2i64 && VT != MVT::v4i64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  DebugLoc DL = N->getDebugLoc();

  // Check LHS for vnot.
  if (N0.getOpcode() == ISD::XOR &&
      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);

  // Check RHS for vnot.
  if (N1.getOpcode() == ISD::XOR &&
      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}

static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const X86Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
  if (R.getNode())
    return R;

  EVT VT = N->getValueType(0);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Look for psign/blend.
  if (VT == MVT::v2i64 || VT == MVT::v4i64) {
    if (!Subtarget->hasSSSE3() ||
        (VT == MVT::v4i64 && !Subtarget->hasAVX2()))
      return SDValue();

    // Canonicalize pandn to RHS.
    if (N0.getOpcode() == X86ISD::ANDNP)
      std::swap(N0, N1);
    // or (and (m, y), (pandn m, x))
    if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
      SDValue Mask = N1.getOperand(0);
      SDValue X = N1.getOperand(1);
      SDValue Y;
      if (N0.getOperand(0) == Mask)
        Y = N0.getOperand(1);
      if (N0.getOperand(1) == Mask)
        Y = N0.getOperand(0);

      // Check to see if the mask appeared in both the AND and the ANDNP.
      if (!Y.getNode())
        return SDValue();

      // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
      // Look through mask bitcast.
      if (Mask.getOpcode() == ISD::BITCAST)
        Mask = Mask.getOperand(0);
      if (X.getOpcode() == ISD::BITCAST)
        X = X.getOperand(0);
      if (Y.getOpcode() == ISD::BITCAST)
        Y = Y.getOperand(0);

      EVT MaskVT = Mask.getValueType();

      // Validate that the Mask operand is a vector sra node.
      // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
      // there is no psrai.b
      if (Mask.getOpcode() != X86ISD::VSRAI)
        return SDValue();

      // Check that the SRA is all signbits.
      SDValue SraC = Mask.getOperand(1);
      unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
      unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
      if ((SraAmt + 1) != EltBits)
        return SDValue();

      DebugLoc DL = N->getDebugLoc();

      // Now we know we at least have a pblendvb with the mask val.  See if
      // we can form a psignb/w/d.
      // psign = x.type == y.type == mask.type && y = sub(0, x);
      if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
          ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
          X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
        assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
               "Unsupported VT for PSIGN");
        Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
        return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
      }
      // PBLENDVB is only available on SSE 4.1.
      if (!Subtarget->hasSSE41())
        return SDValue();

      EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;

      X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
      Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
      Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
      Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
      return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
    }
  }

  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  SDValue ShAmt0 = N0.getOperand(1);
  if (ShAmt0.getValueType() != MVT::i8)
    return SDValue();
  SDValue ShAmt1 = N1.getOperand(1);
  if (ShAmt1.getValueType() != MVT::i8)
    return SDValue();
  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
    ShAmt0 = ShAmt0.getOperand(0);
  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
    ShAmt1 = ShAmt1.getOperand(0);

  DebugLoc DL = N->getDebugLoc();
  unsigned Opc = X86ISD::SHLD;
  SDValue Op0 = N0.getOperand(0);
  SDValue Op1 = N1.getOperand(0);
  if (ShAmt0.getOpcode() == ISD::SUB) {
    Opc = X86ISD::SHRD;
    std::swap(Op0, Op1);
    std::swap(ShAmt0, ShAmt1);
  }

  unsigned Bits = VT.getSizeInBits();
  if (ShAmt1.getOpcode() == ISD::SUB) {
    SDValue Sum = ShAmt1.getOperand(0);
    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
        return DAG.getNode(Opc, DL, VT,
                           Op0, Op1,
                           DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
    }
  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
    if (ShAmt0C &&
        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
      return DAG.getNode(Opc, DL, VT,
                         N0.getOperand(0), N1.getOperand(0),
                         DAG.getNode(ISD::TRUNCATE, DL,
                                     MVT::i8, ShAmt0));
  }

  return SDValue();
}

// Generate NEG and CMOV for integer abs.
static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  // Since X86 does not have CMOV for 8-bit integers, we don't convert
  // 8-bit integer abs to NEG and CMOV.
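  // (For reference, the matched IR pattern below is the classic branchless
  //  abs; e.g. for i32: y = x >> 31; abs = (x + y) ^ y.)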
  if (VT.isInteger() && VT.getSizeInBits() == 8)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  DebugLoc DL = N->getDebugLoc();

  // Check for the pattern XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
  // and change it to SUB and CMOV.
  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
      N0.getOpcode() == ISD::ADD &&
      N0.getOperand(1) == N1 &&
      N1.getOpcode() == ISD::SRA &&
      N1.getOperand(0) == N0.getOperand(0))
    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
      if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
        // Generate SUB & CMOV.
        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
                                  DAG.getConstant(0, VT), N0.getOperand(0));

        SDValue Ops[] = { N0.getOperand(0), Neg,
                          DAG.getConstant(X86::COND_GE, MVT::i8),
                          SDValue(Neg.getNode(), 1) };
        return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
                           Ops, array_lengthof(Ops));
      }
  return SDValue();
}

// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes.
static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (Subtarget->hasCMov()) {
    SDValue RV = performIntegerAbsCombine(N, DAG);
    if (RV.getNode())
      return RV;
  }

  // Try forming BMI instructions if they are available.
  if (!Subtarget->hasBMI())
    return SDValue();

  EVT VT = N->getValueType(0);

  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions");

  // Create BLSMSK instructions by finding X ^ (X-1).
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  DebugLoc DL = N->getDebugLoc();

  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
      isAllOnes(N0.getOperand(1)))
    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1);

  if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
      isAllOnes(N1.getOperand(1)))
    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0);

  return SDValue();
}

/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
  LoadSDNode *Ld = cast<LoadSDNode>(N);
  EVT RegVT = Ld->getValueType(0);
  EVT MemVT = Ld->getMemoryVT();
  DebugLoc dl = Ld->getDebugLoc();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  ISD::LoadExtType Ext = Ld->getExtensionType();

  // If this is a vector EXT Load then attempt to optimize it using a
  // shuffle. We need SSE4 for the shuffles.
  // TODO: It is possible to support ZExt by zeroing the undef values
  // during the shuffle phase or after the shuffle.
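  // (Sketch for a v4i8 -> v4i32 extending load: one i32 scalar load fetches
  //  all four source bytes, they land in a wide v16i8 vector, and a shuffle
  //  moves byte i into the low byte of lane i:
  //    <b0 b1 b2 b3 u ...>  ->  <b0 u u u | b1 u u u | b2 u u u | b3 u u u>
  //  The remaining bytes of each lane stay undef, which an EXTLOAD permits.)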
15199 if (RegVT.isVector() && RegVT.isInteger() && 15200 Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) { 15201 assert(MemVT != RegVT && "Cannot extend to the same type"); 15202 assert(MemVT.isVector() && "Must load a vector from memory"); 15203 15204 unsigned NumElems = RegVT.getVectorNumElements(); 15205 unsigned RegSz = RegVT.getSizeInBits(); 15206 unsigned MemSz = MemVT.getSizeInBits(); 15207 assert(RegSz > MemSz && "Register size must be greater than the mem size"); 15208 15209 // All sizes must be a power of two. 15210 if (!isPowerOf2_32(RegSz * MemSz * NumElems)) 15211 return SDValue(); 15212 15213 // Attempt to load the original value using scalar loads. 15214 // Find the largest scalar type that divides the total loaded size. 15215 MVT SclrLoadTy = MVT::i8; 15216 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; 15217 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { 15218 MVT Tp = (MVT::SimpleValueType)tp; 15219 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { 15220 SclrLoadTy = Tp; 15221 } 15222 } 15223 15224 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 15225 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && 15226 (64 <= MemSz)) 15227 SclrLoadTy = MVT::f64; 15228 15229 // Calculate the number of scalar loads that we need to perform 15230 // in order to load our vector from memory. 15231 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); 15232 15233 // Represent our vector as a sequence of elements which are the 15234 // largest scalar that we can load. 15235 EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy, 15236 RegSz/SclrLoadTy.getSizeInBits()); 15237 15238 // Represent the data using the same element type that is stored in 15239 // memory. In practice, we ''widen'' MemVT. 15240 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), 15241 RegSz/MemVT.getScalarType().getSizeInBits()); 15242 15243 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && 15244 "Invalid vector type"); 15245 15246 // We can't shuffle using an illegal type. 15247 if (!TLI.isTypeLegal(WideVecVT)) 15248 return SDValue(); 15249 15250 SmallVector<SDValue, 8> Chains; 15251 SDValue Ptr = Ld->getBasePtr(); 15252 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8, 15253 TLI.getPointerTy()); 15254 SDValue Res = DAG.getUNDEF(LoadUnitVecVT); 15255 15256 for (unsigned i = 0; i < NumLoads; ++i) { 15257 // Perform a single load. 15258 SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), 15259 Ptr, Ld->getPointerInfo(), 15260 Ld->isVolatile(), Ld->isNonTemporal(), 15261 Ld->isInvariant(), Ld->getAlignment()); 15262 Chains.push_back(ScalarLoad.getValue(1)); 15263 // Create the first element type using SCALAR_TO_VECTOR in order to avoid 15264 // another round of DAGCombining. 15265 if (i == 0) 15266 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); 15267 else 15268 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, 15269 ScalarLoad, DAG.getIntPtrConstant(i)); 15270 15271 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 15272 } 15273 15274 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], 15275 Chains.size()); 15276 15277 // Bitcast the loaded value to a vector of the original element type, in 15278 // the size of the target vector type. 
15279 SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); 15280 unsigned SizeRatio = RegSz/MemSz; 15281 15282 // Redistribute the loaded elements into the different locations. 15283 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 15284 for (unsigned i = 0; i != NumElems; ++i) 15285 ShuffleVec[i*SizeRatio] = i; 15286 15287 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, 15288 DAG.getUNDEF(WideVecVT), 15289 &ShuffleVec[0]); 15290 15291 // Bitcast to the requested type. 15292 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); 15293 // Replace the original load with the new sequence 15294 // and return the new chain. 15295 return DCI.CombineTo(N, Shuff, TF, true); 15296 } 15297 15298 return SDValue(); 15299} 15300 15301/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 15302static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 15303 const X86Subtarget *Subtarget) { 15304 StoreSDNode *St = cast<StoreSDNode>(N); 15305 EVT VT = St->getValue().getValueType(); 15306 EVT StVT = St->getMemoryVT(); 15307 DebugLoc dl = St->getDebugLoc(); 15308 SDValue StoredVal = St->getOperand(1); 15309 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 15310 15311 // If we are saving a concatenation of two XMM registers, perform two stores. 15312 // On Sandy Bridge, 256-bit memory operations are executed by two 15313 // 128-bit ports. However, on Haswell it is better to issue a single 256-bit 15314 // memory operation. 15315 if (VT.is256BitVector() && !Subtarget->hasAVX2() && 15316 StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && 15317 StoredVal.getNumOperands() == 2) { 15318 SDValue Value0 = StoredVal.getOperand(0); 15319 SDValue Value1 = StoredVal.getOperand(1); 15320 15321 SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); 15322 SDValue Ptr0 = St->getBasePtr(); 15323 SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); 15324 15325 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, 15326 St->getPointerInfo(), St->isVolatile(), 15327 St->isNonTemporal(), St->getAlignment()); 15328 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, 15329 St->getPointerInfo(), St->isVolatile(), 15330 St->isNonTemporal(), St->getAlignment()); 15331 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); 15332 } 15333 15334 // Optimize trunc store (of multiple scalars) to shuffle and store. 15335 // First, pack all of the elements in one place. Next, store to memory 15336 // in fewer chunks. 15337 if (St->isTruncatingStore() && VT.isVector()) { 15338 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 15339 unsigned NumElems = VT.getVectorNumElements(); 15340 assert(StVT != VT && "Cannot truncate to the same type"); 15341 unsigned FromSz = VT.getVectorElementType().getSizeInBits(); 15342 unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); 15343 15344 // From, To sizes and ElemCount must be pow of two 15345 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); 15346 // We are going to use the original vector elt for storing. 15347 // Accumulated smaller vector elements must be a multiple of the store size. 
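    // Illustrative case for the rewrite below: a truncating store of v8i16
    // as v8i8 has SizeRatio 2 and WideVecVT v16i8; the shuffle mask
    // <0,2,4,...,14> packs the eight live bytes into the low 64 bits, which
    // the loop further down then writes with a single i64 (or f64) store.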
15348 if (0 != (NumElems * FromSz) % ToSz) return SDValue(); 15349 15350 unsigned SizeRatio = FromSz / ToSz; 15351 15352 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); 15353 15354 // Create a type on which we perform the shuffle 15355 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 15356 StVT.getScalarType(), NumElems*SizeRatio); 15357 15358 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 15359 15360 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); 15361 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 15362 for (unsigned i = 0; i != NumElems; ++i) 15363 ShuffleVec[i] = i * SizeRatio; 15364 15365 // Can't shuffle using an illegal type. 15366 if (!TLI.isTypeLegal(WideVecVT)) 15367 return SDValue(); 15368 15369 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, 15370 DAG.getUNDEF(WideVecVT), 15371 &ShuffleVec[0]); 15372 // At this point all of the data is stored at the bottom of the 15373 // register. We now need to save it to mem. 15374 15375 // Find the largest store unit 15376 MVT StoreType = MVT::i8; 15377 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; 15378 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { 15379 MVT Tp = (MVT::SimpleValueType)tp; 15380 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) 15381 StoreType = Tp; 15382 } 15383 15384 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 15385 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && 15386 (64 <= NumElems * ToSz)) 15387 StoreType = MVT::f64; 15388 15389 // Bitcast the original vector into a vector of store-size units 15390 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 15391 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); 15392 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 15393 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); 15394 SmallVector<SDValue, 8> Chains; 15395 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, 15396 TLI.getPointerTy()); 15397 SDValue Ptr = St->getBasePtr(); 15398 15399 // Perform one or more big stores into memory. 15400 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { 15401 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 15402 StoreType, ShuffWide, 15403 DAG.getIntPtrConstant(i)); 15404 SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, 15405 St->getPointerInfo(), St->isVolatile(), 15406 St->isNonTemporal(), St->getAlignment()); 15407 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 15408 Chains.push_back(Ch); 15409 } 15410 15411 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], 15412 Chains.size()); 15413 } 15414 15415 15416 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 15417 // the FP state in cases where an emms may be missing. 15418 // A preferable solution to the general problem is to figure out the right 15419 // places to insert EMMS. This qualifies as a quick hack. 15420 15421 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 15422 if (VT.getSizeInBits() != 64) 15423 return SDValue(); 15424 15425 const Function *F = DAG.getMachineFunction().getFunction(); 15426 bool NoImplicitFloatOps = F->getFnAttributes(). 
15427 hasAttribute(Attributes::NoImplicitFloat); 15428 bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps 15429 && Subtarget->hasSSE2(); 15430 if ((VT.isVector() || 15431 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 15432 isa<LoadSDNode>(St->getValue()) && 15433 !cast<LoadSDNode>(St->getValue())->isVolatile() && 15434 St->getChain().hasOneUse() && !St->isVolatile()) { 15435 SDNode* LdVal = St->getValue().getNode(); 15436 LoadSDNode *Ld = 0; 15437 int TokenFactorIndex = -1; 15438 SmallVector<SDValue, 8> Ops; 15439 SDNode* ChainVal = St->getChain().getNode(); 15440 // Must be a store of a load. We currently handle two cases: the load 15441 // is a direct child, and it's under an intervening TokenFactor. It is 15442 // possible to dig deeper under nested TokenFactors. 15443 if (ChainVal == LdVal) 15444 Ld = cast<LoadSDNode>(St->getChain()); 15445 else if (St->getValue().hasOneUse() && 15446 ChainVal->getOpcode() == ISD::TokenFactor) { 15447 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { 15448 if (ChainVal->getOperand(i).getNode() == LdVal) { 15449 TokenFactorIndex = i; 15450 Ld = cast<LoadSDNode>(St->getValue()); 15451 } else 15452 Ops.push_back(ChainVal->getOperand(i)); 15453 } 15454 } 15455 15456 if (!Ld || !ISD::isNormalLoad(Ld)) 15457 return SDValue(); 15458 15459 // If this is not the MMX case, i.e. we are just turning i64 load/store 15460 // into f64 load/store, avoid the transformation if there are multiple 15461 // uses of the loaded value. 15462 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 15463 return SDValue(); 15464 15465 DebugLoc LdDL = Ld->getDebugLoc(); 15466 DebugLoc StDL = N->getDebugLoc(); 15467 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 15468 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 15469 // pair instead. 15470 if (Subtarget->is64Bit() || F64IsLegal) { 15471 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 15472 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 15473 Ld->getPointerInfo(), Ld->isVolatile(), 15474 Ld->isNonTemporal(), Ld->isInvariant(), 15475 Ld->getAlignment()); 15476 SDValue NewChain = NewLd.getValue(1); 15477 if (TokenFactorIndex != -1) { 15478 Ops.push_back(NewChain); 15479 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 15480 Ops.size()); 15481 } 15482 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 15483 St->getPointerInfo(), 15484 St->isVolatile(), St->isNonTemporal(), 15485 St->getAlignment()); 15486 } 15487 15488 // Otherwise, lower to two pairs of 32-bit loads / stores. 
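    // Shape of the split (sketch; offsets in bytes):
    //   lo = load i32 [addr]       store i32 lo, [dst]
    //   hi = load i32 [addr+4]     store i32 hi, [dst+4]
    // with the chains token-factored below so the original load/store
    // ordering is preserved.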
15489 SDValue LoAddr = Ld->getBasePtr(); 15490 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 15491 DAG.getConstant(4, MVT::i32)); 15492 15493 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 15494 Ld->getPointerInfo(), 15495 Ld->isVolatile(), Ld->isNonTemporal(), 15496 Ld->isInvariant(), Ld->getAlignment()); 15497 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 15498 Ld->getPointerInfo().getWithOffset(4), 15499 Ld->isVolatile(), Ld->isNonTemporal(), 15500 Ld->isInvariant(), 15501 MinAlign(Ld->getAlignment(), 4)); 15502 15503 SDValue NewChain = LoLd.getValue(1); 15504 if (TokenFactorIndex != -1) { 15505 Ops.push_back(LoLd); 15506 Ops.push_back(HiLd); 15507 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 15508 Ops.size()); 15509 } 15510 15511 LoAddr = St->getBasePtr(); 15512 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 15513 DAG.getConstant(4, MVT::i32)); 15514 15515 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 15516 St->getPointerInfo(), 15517 St->isVolatile(), St->isNonTemporal(), 15518 St->getAlignment()); 15519 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 15520 St->getPointerInfo().getWithOffset(4), 15521 St->isVolatile(), 15522 St->isNonTemporal(), 15523 MinAlign(St->getAlignment(), 4)); 15524 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 15525 } 15526 return SDValue(); 15527} 15528 15529/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal" 15530/// and return the operands for the horizontal operation in LHS and RHS. A 15531/// horizontal operation performs the binary operation on successive elements 15532/// of its first operand, then on successive elements of its second operand, 15533/// returning the resulting values in a vector. For example, if 15534/// A = < float a0, float a1, float a2, float a3 > 15535/// and 15536/// B = < float b0, float b1, float b2, float b3 > 15537/// then the result of doing a horizontal operation on A and B is 15538/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. 15539/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form 15540/// A horizontal-op B, for some already available A and B, and if so then LHS is 15541/// set to A, RHS to B, and the routine returns 'true'. 15542/// Note that the binary operation should have the property that if one of the 15543/// operands is UNDEF then the result is UNDEF. 15544static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { 15545 // Look for the following pattern: if 15546 // A = < float a0, float a1, float a2, float a3 > 15547 // B = < float b0, float b1, float b2, float b3 > 15548 // and 15549 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> 15550 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> 15551 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > 15552 // which is A horizontal-op B. 15553 15554 // At least one of the operands should be a vector shuffle. 15555 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && 15556 RHS.getOpcode() != ISD::VECTOR_SHUFFLE) 15557 return false; 15558 15559 EVT VT = LHS.getValueType(); 15560 15561 assert((VT.is128BitVector() || VT.is256BitVector()) && 15562 "Unsupported vector type for horizontal add/sub"); 15563 15564 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to 15565 // operate independently on 128-bit lanes. 
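  // For example, for v8f32 (two 128-bit lanes) the masks accepted below are
  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 8, 10, 4, 6, 12, 14>
  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 9, 11, 5, 7, 13, 15>
  // i.e. each 128-bit lane pairs neighbouring elements of A and of B
  // independently, which is what VHADDPS/VHSUBPS compute.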
15566 unsigned NumElts = VT.getVectorNumElements(); 15567 unsigned NumLanes = VT.getSizeInBits()/128; 15568 unsigned NumLaneElts = NumElts / NumLanes; 15569 assert((NumLaneElts % 2 == 0) && 15570 "Vector type should have an even number of elements in each lane"); 15571 unsigned HalfLaneElts = NumLaneElts/2; 15572 15573 // View LHS in the form 15574 // LHS = VECTOR_SHUFFLE A, B, LMask 15575 // If LHS is not a shuffle then pretend it is the shuffle 15576 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> 15577 // NOTE: in what follows a default initialized SDValue represents an UNDEF of 15578 // type VT. 15579 SDValue A, B; 15580 SmallVector<int, 16> LMask(NumElts); 15581 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 15582 if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) 15583 A = LHS.getOperand(0); 15584 if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) 15585 B = LHS.getOperand(1); 15586 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); 15587 std::copy(Mask.begin(), Mask.end(), LMask.begin()); 15588 } else { 15589 if (LHS.getOpcode() != ISD::UNDEF) 15590 A = LHS; 15591 for (unsigned i = 0; i != NumElts; ++i) 15592 LMask[i] = i; 15593 } 15594 15595 // Likewise, view RHS in the form 15596 // RHS = VECTOR_SHUFFLE C, D, RMask 15597 SDValue C, D; 15598 SmallVector<int, 16> RMask(NumElts); 15599 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 15600 if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) 15601 C = RHS.getOperand(0); 15602 if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) 15603 D = RHS.getOperand(1); 15604 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); 15605 std::copy(Mask.begin(), Mask.end(), RMask.begin()); 15606 } else { 15607 if (RHS.getOpcode() != ISD::UNDEF) 15608 C = RHS; 15609 for (unsigned i = 0; i != NumElts; ++i) 15610 RMask[i] = i; 15611 } 15612 15613 // Check that the shuffles are both shuffling the same vectors. 15614 if (!(A == C && B == D) && !(A == D && B == C)) 15615 return false; 15616 15617 // If everything is UNDEF then bail out: it would be better to fold to UNDEF. 15618 if (!A.getNode() && !B.getNode()) 15619 return false; 15620 15621 // If A and B occur in reverse order in RHS, then "swap" them (which means 15622 // rewriting the mask). 15623 if (A != C) 15624 CommuteVectorShuffleMask(RMask, NumElts); 15625 15626 // At this point LHS and RHS are equivalent to 15627 // LHS = VECTOR_SHUFFLE A, B, LMask 15628 // RHS = VECTOR_SHUFFLE A, B, RMask 15629 // Check that the masks correspond to performing a horizontal operation. 15630 for (unsigned i = 0; i != NumElts; ++i) { 15631 int LIdx = LMask[i], RIdx = RMask[i]; 15632 15633 // Ignore any UNDEF components. 15634 if (LIdx < 0 || RIdx < 0 || 15635 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || 15636 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) 15637 continue; 15638 15639 // Check that successive elements are being operated on. If not, this is 15640 // not a horizontal operation. 15641 unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs 15642 unsigned LaneStart = (i/NumLaneElts) * NumLaneElts; 15643 int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart; 15644 if (!(LIdx == Index && RIdx == Index + 1) && 15645 !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) 15646 return false; 15647 } 15648 15649 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. 15650 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. 
15651 return true; 15652} 15653 15654/// PerformFADDCombine - Do target-specific dag combines on floating point adds. 15655static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, 15656 const X86Subtarget *Subtarget) { 15657 EVT VT = N->getValueType(0); 15658 SDValue LHS = N->getOperand(0); 15659 SDValue RHS = N->getOperand(1); 15660 15661 // Try to synthesize horizontal adds from adds of shuffles. 15662 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 15663 (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 15664 isHorizontalBinOp(LHS, RHS, true)) 15665 return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS); 15666 return SDValue(); 15667} 15668 15669/// PerformFSUBCombine - Do target-specific dag combines on floating point subs. 15670static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, 15671 const X86Subtarget *Subtarget) { 15672 EVT VT = N->getValueType(0); 15673 SDValue LHS = N->getOperand(0); 15674 SDValue RHS = N->getOperand(1); 15675 15676 // Try to synthesize horizontal subs from subs of shuffles. 15677 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 15678 (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 15679 isHorizontalBinOp(LHS, RHS, false)) 15680 return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS); 15681 return SDValue(); 15682} 15683 15684/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 15685/// X86ISD::FXOR nodes. 15686static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 15687 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 15688 // F[X]OR(0.0, x) -> x 15689 // F[X]OR(x, 0.0) -> x 15690 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 15691 if (C->getValueAPF().isPosZero()) 15692 return N->getOperand(1); 15693 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 15694 if (C->getValueAPF().isPosZero()) 15695 return N->getOperand(0); 15696 return SDValue(); 15697} 15698 15699/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and 15700/// X86ISD::FMAX nodes. 15701static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { 15702 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); 15703 15704 // Only perform optimizations if UnsafeMath is used. 15705 if (!DAG.getTarget().Options.UnsafeFPMath) 15706 return SDValue(); 15707 15708 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes 15709 // into FMINC and FMAXC, which are Commutative operations. 15710 unsigned NewOp = 0; 15711 switch (N->getOpcode()) { 15712 default: llvm_unreachable("unknown opcode"); 15713 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; 15714 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; 15715 } 15716 15717 return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0), 15718 N->getOperand(0), N->getOperand(1)); 15719} 15720 15721 15722/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 
15723static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 15724 // FAND(0.0, x) -> 0.0 15725 // FAND(x, 0.0) -> 0.0 15726 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 15727 if (C->getValueAPF().isPosZero()) 15728 return N->getOperand(0); 15729 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 15730 if (C->getValueAPF().isPosZero()) 15731 return N->getOperand(1); 15732 return SDValue(); 15733} 15734 15735static SDValue PerformBTCombine(SDNode *N, 15736 SelectionDAG &DAG, 15737 TargetLowering::DAGCombinerInfo &DCI) { 15738 // BT ignores high bits in the bit index operand. 15739 SDValue Op1 = N->getOperand(1); 15740 if (Op1.hasOneUse()) { 15741 unsigned BitWidth = Op1.getValueSizeInBits(); 15742 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 15743 APInt KnownZero, KnownOne; 15744 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 15745 !DCI.isBeforeLegalizeOps()); 15746 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 15747 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 15748 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 15749 DCI.CommitTargetLoweringOpt(TLO); 15750 } 15751 return SDValue(); 15752} 15753 15754static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 15755 SDValue Op = N->getOperand(0); 15756 if (Op.getOpcode() == ISD::BITCAST) 15757 Op = Op.getOperand(0); 15758 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 15759 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 15760 VT.getVectorElementType().getSizeInBits() == 15761 OpVT.getVectorElementType().getSizeInBits()) { 15762 return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); 15763 } 15764 return SDValue(); 15765} 15766 15767static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, 15768 TargetLowering::DAGCombinerInfo &DCI, 15769 const X86Subtarget *Subtarget) { 15770 if (!DCI.isBeforeLegalizeOps()) 15771 return SDValue(); 15772 15773 if (!Subtarget->hasAVX()) 15774 return SDValue(); 15775 15776 EVT VT = N->getValueType(0); 15777 SDValue Op = N->getOperand(0); 15778 EVT OpVT = Op.getValueType(); 15779 DebugLoc dl = N->getDebugLoc(); 15780 15781 if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) || 15782 (VT == MVT::v8i32 && OpVT == MVT::v8i16)) { 15783 15784 if (Subtarget->hasAVX2()) 15785 return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op); 15786 15787 // Optimize vectors in AVX mode 15788 // Sign extend v8i16 to v8i32 and 15789 // v4i32 to v4i64 15790 // 15791 // Divide input vector into two parts 15792 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} 15793 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 15794 // concat the vectors to original VT 15795 15796 unsigned NumElems = OpVT.getVectorNumElements(); 15797 SDValue Undef = DAG.getUNDEF(OpVT); 15798 15799 SmallVector<int,8> ShufMask1(NumElems, -1); 15800 for (unsigned i = 0; i != NumElems/2; ++i) 15801 ShufMask1[i] = i; 15802 15803 SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask1[0]); 15804 15805 SmallVector<int,8> ShufMask2(NumElems, -1); 15806 for (unsigned i = 0; i != NumElems/2; ++i) 15807 ShufMask2[i] = i + NumElems/2; 15808 15809 SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask2[0]); 15810 15811 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), 15812 VT.getVectorNumElements()/2); 15813 15814 OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); 15815 OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi); 
15816 15817 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 15818 } 15819 return SDValue(); 15820} 15821 15822static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, 15823 const X86Subtarget* Subtarget) { 15824 DebugLoc dl = N->getDebugLoc(); 15825 EVT VT = N->getValueType(0); 15826 15827 // Let legalize expand this if it isn't a legal type yet. 15828 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 15829 return SDValue(); 15830 15831 EVT ScalarVT = VT.getScalarType(); 15832 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || 15833 (!Subtarget->hasFMA() && !Subtarget->hasFMA4())) 15834 return SDValue(); 15835 15836 SDValue A = N->getOperand(0); 15837 SDValue B = N->getOperand(1); 15838 SDValue C = N->getOperand(2); 15839 15840 bool NegA = (A.getOpcode() == ISD::FNEG); 15841 bool NegB = (B.getOpcode() == ISD::FNEG); 15842 bool NegC = (C.getOpcode() == ISD::FNEG); 15843 15844 // Negative multiplication when NegA xor NegB 15845 bool NegMul = (NegA != NegB); 15846 if (NegA) 15847 A = A.getOperand(0); 15848 if (NegB) 15849 B = B.getOperand(0); 15850 if (NegC) 15851 C = C.getOperand(0); 15852 15853 unsigned Opcode; 15854 if (!NegMul) 15855 Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; 15856 else 15857 Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; 15858 15859 return DAG.getNode(Opcode, dl, VT, A, B, C); 15860} 15861 15862static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, 15863 TargetLowering::DAGCombinerInfo &DCI, 15864 const X86Subtarget *Subtarget) { 15865 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 15866 // (and (i32 x86isd::setcc_carry), 1) 15867 // This eliminates the zext. This transformation is necessary because 15868 // ISD::SETCC is always legalized to i8. 15869 DebugLoc dl = N->getDebugLoc(); 15870 SDValue N0 = N->getOperand(0); 15871 EVT VT = N->getValueType(0); 15872 EVT OpVT = N0.getValueType(); 15873 15874 if (N0.getOpcode() == ISD::AND && 15875 N0.hasOneUse() && 15876 N0.getOperand(0).hasOneUse()) { 15877 SDValue N00 = N0.getOperand(0); 15878 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 15879 return SDValue(); 15880 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 15881 if (!C || C->getZExtValue() != 1) 15882 return SDValue(); 15883 return DAG.getNode(ISD::AND, dl, VT, 15884 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 15885 N00.getOperand(0), N00.getOperand(1)), 15886 DAG.getConstant(1, VT)); 15887 } 15888 15889 // Optimize vectors in AVX mode: 15890 // 15891 // v8i16 -> v8i32 15892 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. 15893 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. 15894 // Concat upper and lower parts. 15895 // 15896 // v4i32 -> v4i64 15897 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. 15898 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. 15899 // Concat upper and lower parts. 
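  // E.g. (zext v8i16 %x to v8i32) becomes, roughly:
  //   lo  = vpunpcklwd %x, zero   ; x0,0,x1,0,x2,0,x3,0 == low  v4i32
  //   hi  = vpunpckhwd %x, zero   ; x4,0,x5,0,x6,0,x7,0 == high v4i32
  //   res = concat lo, hi         ; one vinsertf128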
15900 // 15901 if (!DCI.isBeforeLegalizeOps()) 15902 return SDValue(); 15903 15904 if (!Subtarget->hasAVX()) 15905 return SDValue(); 15906 15907 if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) || 15908 ((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) { 15909 15910 if (Subtarget->hasAVX2()) 15911 return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0); 15912 15913 SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl); 15914 SDValue OpLo = getUnpackl(DAG, dl, OpVT, N0, ZeroVec); 15915 SDValue OpHi = getUnpackh(DAG, dl, OpVT, N0, ZeroVec); 15916 15917 EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 15918 VT.getVectorNumElements()/2); 15919 15920 OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); 15921 OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); 15922 15923 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 15924 } 15925 15926 return SDValue(); 15927} 15928 15929// Optimize x == -y --> x+y == 0 15930// x != -y --> x+y != 0 15931static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { 15932 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 15933 SDValue LHS = N->getOperand(0); 15934 SDValue RHS = N->getOperand(1); 15935 15936 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) 15937 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) 15938 if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { 15939 SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(), 15940 LHS.getValueType(), RHS, LHS.getOperand(1)); 15941 return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0), 15942 addV, DAG.getConstant(0, addV.getValueType()), CC); 15943 } 15944 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) 15945 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) 15946 if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { 15947 SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(), 15948 RHS.getValueType(), LHS, RHS.getOperand(1)); 15949 return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0), 15950 addV, DAG.getConstant(0, addV.getValueType()), CC); 15951 } 15952 return SDValue(); 15953} 15954 15955// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 15956static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, 15957 TargetLowering::DAGCombinerInfo &DCI, 15958 const X86Subtarget *Subtarget) { 15959 DebugLoc DL = N->getDebugLoc(); 15960 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); 15961 SDValue EFLAGS = N->getOperand(1); 15962 15963 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 15964 // a zext and produces an all-ones bit which is more useful than 0/1 in some 15965 // cases. 15966 if (CC == X86::COND_B) 15967 return DAG.getNode(ISD::AND, DL, MVT::i8, 15968 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 15969 DAG.getConstant(CC, MVT::i8), EFLAGS), 15970 DAG.getConstant(1, MVT::i8)); 15971 15972 SDValue Flags; 15973 15974 Flags = checkBoolTestSetCCCombine(EFLAGS, CC); 15975 if (Flags.getNode()) { 15976 SDValue Cond = DAG.getConstant(CC, MVT::i8); 15977 return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); 15978 } 15979 15980 return SDValue(); 15981} 15982 15983// Optimize branch condition evaluation. 
15984// 15985static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, 15986 TargetLowering::DAGCombinerInfo &DCI, 15987 const X86Subtarget *Subtarget) { 15988 DebugLoc DL = N->getDebugLoc(); 15989 SDValue Chain = N->getOperand(0); 15990 SDValue Dest = N->getOperand(1); 15991 SDValue EFLAGS = N->getOperand(3); 15992 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); 15993 15994 SDValue Flags; 15995 15996 Flags = checkBoolTestSetCCCombine(EFLAGS, CC); 15997 if (Flags.getNode()) { 15998 SDValue Cond = DAG.getConstant(CC, MVT::i8); 15999 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, 16000 Flags); 16001 } 16002 16003 return SDValue(); 16004} 16005 16006static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG) { 16007 SDValue Op0 = N->getOperand(0); 16008 EVT InVT = Op0->getValueType(0); 16009 16010 // UINT_TO_FP(v4i8) -> SINT_TO_FP(ZEXT(v4i8 to v4i32)) 16011 if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { 16012 DebugLoc dl = N->getDebugLoc(); 16013 MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; 16014 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); 16015 // Notice that we use SINT_TO_FP because we know that the high bits 16016 // are zero and SINT_TO_FP is better supported by the hardware. 16017 return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); 16018 } 16019 16020 return SDValue(); 16021} 16022 16023static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, 16024 const X86TargetLowering *XTLI) { 16025 SDValue Op0 = N->getOperand(0); 16026 EVT InVT = Op0->getValueType(0); 16027 16028 // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32)) 16029 if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { 16030 DebugLoc dl = N->getDebugLoc(); 16031 MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; 16032 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); 16033 return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); 16034 } 16035 16036 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have 16037 // a 32-bit target where SSE doesn't support i64->FP operations. 16038 if (Op0.getOpcode() == ISD::LOAD) { 16039 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); 16040 EVT VT = Ld->getValueType(0); 16041 if (!Ld->isVolatile() && !N->getValueType(0).isVector() && 16042 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && 16043 !XTLI->getSubtarget()->is64Bit() && 16044 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 16045 SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), 16046 Ld->getChain(), Op0, DAG); 16047 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); 16048 return FILDChain; 16049 } 16050 } 16051 return SDValue(); 16052} 16053 16054static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) { 16055 EVT VT = N->getValueType(0); 16056 16057 // v4i8 = FP_TO_SINT() -> v4i8 = TRUNCATE (V4i32 = FP_TO_SINT() 16058 if (VT == MVT::v8i8 || VT == MVT::v4i8) { 16059 DebugLoc dl = N->getDebugLoc(); 16060 MVT DstVT = VT == MVT::v4i8 ? 
16061     SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0));
16062     return DAG.getNode(ISD::TRUNCATE, dl, VT, I);
16063   }
16064
16065   return SDValue();
16066 }
16067
16068 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
16069 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
16070                                  X86TargetLowering::DAGCombinerInfo &DCI) {
16071   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
16072   // the result is either zero or one (depending on the input carry bit).
16073   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
16074   if (X86::isZeroNode(N->getOperand(0)) &&
16075       X86::isZeroNode(N->getOperand(1)) &&
16076       // We don't have a good way to replace an EFLAGS use, so only do this when
16077       // dead right now.
16078       SDValue(N, 1).use_empty()) {
16079     DebugLoc DL = N->getDebugLoc();
16080     EVT VT = N->getValueType(0);
16081     SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
16082     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
16083                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
16084                                            DAG.getConstant(X86::COND_B, MVT::i8),
16085                                            N->getOperand(2)),
16086                                DAG.getConstant(1, VT));
16087     return DCI.CombineTo(N, Res1, CarryOut);
16088   }
16089
16090   return SDValue();
16091 }
16092
16093 // fold (add Y, (sete  X, 0)) -> adc  0, Y
16094 //      (add Y, (setne X, 0)) -> sbb -1, Y
16095 //      (sub (sete  X, 0), Y) -> sbb  0, Y
16096 //      (sub (setne X, 0), Y) -> adc -1, Y
16097 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
16098   DebugLoc DL = N->getDebugLoc();
16099
16100   // Look through ZExts.
16101   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
16102   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
16103     return SDValue();
16104
16105   SDValue SetCC = Ext.getOperand(0);
16106   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
16107     return SDValue();
16108
16109   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
16110   if (CC != X86::COND_E && CC != X86::COND_NE)
16111     return SDValue();
16112
16113   SDValue Cmp = SetCC.getOperand(1);
16114   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
16115       !X86::isZeroNode(Cmp.getOperand(1)) ||
16116       !Cmp.getOperand(0).getValueType().isInteger())
16117     return SDValue();
16118
16119   SDValue CmpOp0 = Cmp.getOperand(0);
16120   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
16121                                DAG.getConstant(1, CmpOp0.getValueType()));
16122
16123   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
16124   if (CC == X86::COND_NE)
16125     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
16126                        DL, OtherVal.getValueType(), OtherVal,
16127                        DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
16128   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
16129                      DL, OtherVal.getValueType(), OtherVal,
16130                      DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
16131 }
16132
16133 /// PerformAddCombine - Do target-specific dag combines on integer adds.
16134 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
16135                                  const X86Subtarget *Subtarget) {
16136   EVT VT = N->getValueType(0);
16137   SDValue Op0 = N->getOperand(0);
16138   SDValue Op1 = N->getOperand(1);
16139
16140   // Try to synthesize horizontal adds from adds of shuffles.
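  // E.g. for v4i32 (see isHorizontalBinOp above):
  //   (add (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>))
  // becomes (X86ISD::HADD A, B), i.e. a single PHADDD.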
16141   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
16142        (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
16143       isHorizontalBinOp(Op0, Op1, true))
16144     return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
16145
16146   return OptimizeConditionalInDecrement(N, DAG);
16147 }
16148
16149 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
16150                                  const X86Subtarget *Subtarget) {
16151   SDValue Op0 = N->getOperand(0);
16152   SDValue Op1 = N->getOperand(1);
16153
16154   // X86 can't encode an immediate LHS of a sub. See if we can push the
16155   // negation into a preceding instruction.
16156   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
16157     // If the RHS of the sub is a XOR with one use and a constant, invert the
16158     // immediate. Then add one to the LHS of the sub so we can turn
16159     // X-Y -> X+~Y+1, saving one register.
16160     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
16161         isa<ConstantSDNode>(Op1.getOperand(1))) {
16162       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
16163       EVT VT = Op0.getValueType();
16164       SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
16165                                    Op1.getOperand(0),
16166                                    DAG.getConstant(~XorC, VT));
16167       return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
16168                          DAG.getConstant(C->getAPIntValue()+1, VT));
16169     }
16170   }
16171
16172   // Try to synthesize horizontal subs from subs of shuffles.
16173   EVT VT = N->getValueType(0);
16174   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
16175        (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
16176       isHorizontalBinOp(Op0, Op1, true))
16177     return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
16178
16179   return OptimizeConditionalInDecrement(N, DAG);
16180 }
16181
16182 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
16183                                              DAGCombinerInfo &DCI) const {
16184   SelectionDAG &DAG = DCI.DAG;
16185   switch (N->getOpcode()) {
16186   default: break;
16187   case ISD::EXTRACT_VECTOR_ELT:
16188     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
16189   case ISD::VSELECT:
16190   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
16191   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
16192   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
16193   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
16194   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
16195   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
16196   case ISD::SHL:
16197   case ISD::SRA:
16198   case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
16199   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
16200   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
16201   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
16202   case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
16203   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
16204   case ISD::UINT_TO_FP:     return PerformUINT_TO_FPCombine(N, DAG);
16205   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
16206   case ISD::FP_TO_SINT:     return PerformFP_TO_SINTCombine(N, DAG);
16207   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
16208   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
16209   case X86ISD::FXOR:
16210   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
16211   case X86ISD::FMIN:
16212   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
16213   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
16214   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
16215   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
16216   case ISD::ANY_EXTEND:
16217   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
16218   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
16219   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG, DCI, Subtarget);
16220   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG);
16221   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
16222   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
16223   case X86ISD::SHUFP:       // Handle all target specific shuffles
16224   case X86ISD::PALIGN:
16225   case X86ISD::UNPCKH:
16226   case X86ISD::UNPCKL:
16227   case X86ISD::MOVHLPS:
16228   case X86ISD::MOVLHPS:
16229   case X86ISD::PSHUFD:
16230   case X86ISD::PSHUFHW:
16231   case X86ISD::PSHUFLW:
16232   case X86ISD::MOVSS:
16233   case X86ISD::MOVSD:
16234   case X86ISD::VPERMILP:
16235   case X86ISD::VPERM2X128:
16236   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI, Subtarget);
16237   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
16238   }
16239
16240   return SDValue();
16241 }
16242
16243 /// isTypeDesirableForOp - Return true if the target has native support for
16244 /// the specified value type and it is 'desirable' to use the type for the
16245 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
16246 /// instruction encodings are longer and some i16 instructions are slow.
16247 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
16248   if (!isTypeLegal(VT))
16249     return false;
16250   if (VT != MVT::i16)
16251     return true;
16252
16253   switch (Opc) {
16254   default:
16255     return true;
16256   case ISD::LOAD:
16257   case ISD::SIGN_EXTEND:
16258   case ISD::ZERO_EXTEND:
16259   case ISD::ANY_EXTEND:
16260   case ISD::SHL:
16261   case ISD::SRL:
16262   case ISD::SUB:
16263   case ISD::ADD:
16264   case ISD::MUL:
16265   case ISD::AND:
16266   case ISD::OR:
16267   case ISD::XOR:
16268     return false;
16269   }
16270 }
16271
16272 /// IsDesirableToPromoteOp - This method queries the target whether it is
16273 /// beneficial for dag combiner to promote the specified node. If true, it
16274 /// should return the desired promotion type by reference.
16275 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
16276   EVT VT = Op.getValueType();
16277   if (VT != MVT::i16)
16278     return false;
16279
16280   bool Promote = false;
16281   bool Commute = false;
16282   switch (Op.getOpcode()) {
16283   default: break;
16284   case ISD::LOAD: {
16285     LoadSDNode *LD = cast<LoadSDNode>(Op);
16286     // If the non-extending load has a single use and it's not live out, then it
16287     // might be folded.
16288     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
16289                                                      Op.hasOneUse()*/) {
16290       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16291              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
16292         // The only case where we'd want to promote LOAD (rather than having it
16293         // promoted as an operand) is when its only use is live-out.
16294         if (UI->getOpcode() != ISD::CopyToReg)
16295           return false;
16296       }
16297     }
16298     Promote = true;
16299     break;
16300   }
16301   case ISD::SIGN_EXTEND:
16302   case ISD::ZERO_EXTEND:
16303   case ISD::ANY_EXTEND:
16304     Promote = true;
16305     break;
16306   case ISD::SHL:
16307   case ISD::SRL: {
16308     SDValue N0 = Op.getOperand(0);
16309     // Look out for (store (shl (load), x)).
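    // E.g. (store (shl (load m), c), m) can usually be selected as a single
    // read-modify-write shift on memory (along the lines of `shlw $c, m`);
    // promoting the shift to i32 would keep the load and store separate, so
    // we decline to promote in that case.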
16310 if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) 16311 return false; 16312 Promote = true; 16313 break; 16314 } 16315 case ISD::ADD: 16316 case ISD::MUL: 16317 case ISD::AND: 16318 case ISD::OR: 16319 case ISD::XOR: 16320 Commute = true; 16321 // fallthrough 16322 case ISD::SUB: { 16323 SDValue N0 = Op.getOperand(0); 16324 SDValue N1 = Op.getOperand(1); 16325 if (!Commute && MayFoldLoad(N1)) 16326 return false; 16327 // Avoid disabling potential load folding opportunities. 16328 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 16329 return false; 16330 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 16331 return false; 16332 Promote = true; 16333 } 16334 } 16335 16336 PVT = MVT::i32; 16337 return Promote; 16338} 16339 16340//===----------------------------------------------------------------------===// 16341// X86 Inline Assembly Support 16342//===----------------------------------------------------------------------===// 16343 16344namespace { 16345 // Helper to match a string separated by whitespace. 16346 bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) { 16347 s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace. 16348 16349 for (unsigned i = 0, e = args.size(); i != e; ++i) { 16350 StringRef piece(*args[i]); 16351 if (!s.startswith(piece)) // Check if the piece matches. 16352 return false; 16353 16354 s = s.substr(piece.size()); 16355 StringRef::size_type pos = s.find_first_not_of(" \t"); 16356 if (pos == 0) // We matched a prefix. 16357 return false; 16358 16359 s = s.substr(pos); 16360 } 16361 16362 return s.empty(); 16363 } 16364 const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={}; 16365} 16366 16367bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 16368 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 16369 16370 std::string AsmStr = IA->getAsmString(); 16371 16372 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 16373 if (!Ty || Ty->getBitWidth() % 16 != 0) 16374 return false; 16375 16376 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 16377 SmallVector<StringRef, 4> AsmPieces; 16378 SplitString(AsmStr, AsmPieces, ";\n"); 16379 16380 switch (AsmPieces.size()) { 16381 default: return false; 16382 case 1: 16383 // FIXME: this should verify that we are targeting a 486 or better. If not, 16384 // we will turn this bswap into something that will be lowered to logical 16385 // ops instead of emitting the bswap asm. For now, we don't support 486 or 16386 // lower so don't worry about this. 16387 // bswap $0 16388 if (matchAsm(AsmPieces[0], "bswap", "$0") || 16389 matchAsm(AsmPieces[0], "bswapl", "$0") || 16390 matchAsm(AsmPieces[0], "bswapq", "$0") || 16391 matchAsm(AsmPieces[0], "bswap", "${0:q}") || 16392 matchAsm(AsmPieces[0], "bswapl", "${0:q}") || 16393 matchAsm(AsmPieces[0], "bswapq", "${0:q}")) { 16394 // No need to check constraints, nothing other than the equivalent of 16395 // "=r,0" would be valid here. 
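      // In IR terms the rewrite is (sketch):
      //   %r = call i32 asm "bswap $0", "=r,0"(i32 %x)
      // ->
      //   %r = call i32 @llvm.bswap.i32(i32 %x)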
16396 return IntrinsicLowering::LowerToByteSwap(CI); 16397 } 16398 16399 // rorw $$8, ${0:w} --> llvm.bswap.i16 16400 if (CI->getType()->isIntegerTy(16) && 16401 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && 16402 (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") || 16403 matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) { 16404 AsmPieces.clear(); 16405 const std::string &ConstraintsStr = IA->getConstraintString(); 16406 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 16407 std::sort(AsmPieces.begin(), AsmPieces.end()); 16408 if (AsmPieces.size() == 4 && 16409 AsmPieces[0] == "~{cc}" && 16410 AsmPieces[1] == "~{dirflag}" && 16411 AsmPieces[2] == "~{flags}" && 16412 AsmPieces[3] == "~{fpsr}") 16413 return IntrinsicLowering::LowerToByteSwap(CI); 16414 } 16415 break; 16416 case 3: 16417 if (CI->getType()->isIntegerTy(32) && 16418 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && 16419 matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") && 16420 matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") && 16421 matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) { 16422 AsmPieces.clear(); 16423 const std::string &ConstraintsStr = IA->getConstraintString(); 16424 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 16425 std::sort(AsmPieces.begin(), AsmPieces.end()); 16426 if (AsmPieces.size() == 4 && 16427 AsmPieces[0] == "~{cc}" && 16428 AsmPieces[1] == "~{dirflag}" && 16429 AsmPieces[2] == "~{flags}" && 16430 AsmPieces[3] == "~{fpsr}") 16431 return IntrinsicLowering::LowerToByteSwap(CI); 16432 } 16433 16434 if (CI->getType()->isIntegerTy(64)) { 16435 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); 16436 if (Constraints.size() >= 2 && 16437 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 16438 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 16439 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 16440 if (matchAsm(AsmPieces[0], "bswap", "%eax") && 16441 matchAsm(AsmPieces[1], "bswap", "%edx") && 16442 matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx")) 16443 return IntrinsicLowering::LowerToByteSwap(CI); 16444 } 16445 } 16446 break; 16447 } 16448 return false; 16449} 16450 16451 16452 16453/// getConstraintType - Given a constraint letter, return the type of 16454/// constraint it is for this target. 16455X86TargetLowering::ConstraintType 16456X86TargetLowering::getConstraintType(const std::string &Constraint) const { 16457 if (Constraint.size() == 1) { 16458 switch (Constraint[0]) { 16459 case 'R': 16460 case 'q': 16461 case 'Q': 16462 case 'f': 16463 case 't': 16464 case 'u': 16465 case 'y': 16466 case 'x': 16467 case 'Y': 16468 case 'l': 16469 return C_RegisterClass; 16470 case 'a': 16471 case 'b': 16472 case 'c': 16473 case 'd': 16474 case 'S': 16475 case 'D': 16476 case 'A': 16477 return C_Register; 16478 case 'I': 16479 case 'J': 16480 case 'K': 16481 case 'L': 16482 case 'M': 16483 case 'N': 16484 case 'G': 16485 case 'C': 16486 case 'e': 16487 case 'Z': 16488 return C_Other; 16489 default: 16490 break; 16491 } 16492 } 16493 return TargetLowering::getConstraintType(Constraint); 16494} 16495 16496/// Examine constraint type and operand type and determine a weight value. 16497/// This object must already have been set up with the operand type 16498/// and the current alternative constraint selected. 
16499TargetLowering::ConstraintWeight 16500 X86TargetLowering::getSingleConstraintMatchWeight( 16501 AsmOperandInfo &info, const char *constraint) const { 16502 ConstraintWeight weight = CW_Invalid; 16503 Value *CallOperandVal = info.CallOperandVal; 16504 // If we don't have a value, we can't do a match, 16505 // but allow it at the lowest weight. 16506 if (CallOperandVal == NULL) 16507 return CW_Default; 16508 Type *type = CallOperandVal->getType(); 16509 // Look at the constraint type. 16510 switch (*constraint) { 16511 default: 16512 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 16513 case 'R': 16514 case 'q': 16515 case 'Q': 16516 case 'a': 16517 case 'b': 16518 case 'c': 16519 case 'd': 16520 case 'S': 16521 case 'D': 16522 case 'A': 16523 if (CallOperandVal->getType()->isIntegerTy()) 16524 weight = CW_SpecificReg; 16525 break; 16526 case 'f': 16527 case 't': 16528 case 'u': 16529 if (type->isFloatingPointTy()) 16530 weight = CW_SpecificReg; 16531 break; 16532 case 'y': 16533 if (type->isX86_MMXTy() && Subtarget->hasMMX()) 16534 weight = CW_SpecificReg; 16535 break; 16536 case 'x': 16537 case 'Y': 16538 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) || 16539 ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasAVX())) 16540 weight = CW_Register; 16541 break; 16542 case 'I': 16543 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { 16544 if (C->getZExtValue() <= 31) 16545 weight = CW_Constant; 16546 } 16547 break; 16548 case 'J': 16549 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 16550 if (C->getZExtValue() <= 63) 16551 weight = CW_Constant; 16552 } 16553 break; 16554 case 'K': 16555 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 16556 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) 16557 weight = CW_Constant; 16558 } 16559 break; 16560 case 'L': 16561 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 16562 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) 16563 weight = CW_Constant; 16564 } 16565 break; 16566 case 'M': 16567 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 16568 if (C->getZExtValue() <= 3) 16569 weight = CW_Constant; 16570 } 16571 break; 16572 case 'N': 16573 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 16574 if (C->getZExtValue() <= 0xff) 16575 weight = CW_Constant; 16576 } 16577 break; 16578 case 'G': 16579 case 'C': 16580 if (dyn_cast<ConstantFP>(CallOperandVal)) { 16581 weight = CW_Constant; 16582 } 16583 break; 16584 case 'e': 16585 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 16586 if ((C->getSExtValue() >= -0x80000000LL) && 16587 (C->getSExtValue() <= 0x7fffffffLL)) 16588 weight = CW_Constant; 16589 } 16590 break; 16591 case 'Z': 16592 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 16593 if (C->getZExtValue() <= 0xffffffff) 16594 weight = CW_Constant; 16595 } 16596 break; 16597 } 16598 return weight; 16599} 16600 16601/// LowerXConstraint - try to replace an X constraint, which matches anything, 16602/// with another that has more specific requirements based on the type of the 16603/// corresponding operand. 16604const char *X86TargetLowering:: 16605LowerXConstraint(EVT ConstraintVT) const { 16606 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 16607 // 'f' like normal targets. 
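  // E.g. an "X"-constrained double operand is re-lowered to "Y" when SSE2
  // is available (or to "x" with only SSE1), so it can live in an XMM
  // register rather than being forced onto the x87 stack.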
16608 if (ConstraintVT.isFloatingPoint()) { 16609 if (Subtarget->hasSSE2()) 16610 return "Y"; 16611 if (Subtarget->hasSSE1()) 16612 return "x"; 16613 } 16614 16615 return TargetLowering::LowerXConstraint(ConstraintVT); 16616} 16617 16618/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 16619/// vector. If it is invalid, don't add anything to Ops. 16620void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 16621 std::string &Constraint, 16622 std::vector<SDValue>&Ops, 16623 SelectionDAG &DAG) const { 16624 SDValue Result(0, 0); 16625 16626 // Only support length 1 constraints for now. 16627 if (Constraint.length() > 1) return; 16628 16629 char ConstraintLetter = Constraint[0]; 16630 switch (ConstraintLetter) { 16631 default: break; 16632 case 'I': 16633 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 16634 if (C->getZExtValue() <= 31) { 16635 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 16636 break; 16637 } 16638 } 16639 return; 16640 case 'J': 16641 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 16642 if (C->getZExtValue() <= 63) { 16643 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 16644 break; 16645 } 16646 } 16647 return; 16648 case 'K': 16649 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 16650 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 16651 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 16652 break; 16653 } 16654 } 16655 return; 16656 case 'N': 16657 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 16658 if (C->getZExtValue() <= 255) { 16659 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 16660 break; 16661 } 16662 } 16663 return; 16664 case 'e': { 16665 // 32-bit signed value 16666 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 16667 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 16668 C->getSExtValue())) { 16669 // Widen to 64 bits here to get it sign extended. 16670 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 16671 break; 16672 } 16673 // FIXME gcc accepts some relocatable values here too, but only in certain 16674 // memory models; it's complicated. 16675 } 16676 return; 16677 } 16678 case 'Z': { 16679 // 32-bit unsigned value 16680 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 16681 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 16682 C->getZExtValue())) { 16683 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 16684 break; 16685 } 16686 } 16687 // FIXME gcc accepts some relocatable values here too, but only in certain 16688 // memory models; it's complicated. 16689 return; 16690 } 16691 case 'i': { 16692 // Literal immediates are always ok. 16693 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 16694 // Widen to 64 bits here to get it sign extended. 16695 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 16696 break; 16697 } 16698 16699 // In any sort of PIC mode addresses need to be computed at runtime by 16700 // adding in a register or some sort of table lookup. These can't 16701 // be used as immediates. 16702 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 16703 return; 16704 16705 // If we are in non-pic codegen mode, we allow the address of a global (with 16706 // an optional displacement) to be used with 'i'. 16707 GlobalAddressSDNode *GA = 0; 16708 int64_t Offset = 0; 16709 16710 // Match either (GA), (GA+C), (GA+C1+C2), etc. 
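    // E.g. an operand of the form (add (add GA, 4), 8) is accepted with GA
    // as the symbol and Offset == 12; a SUB of a constant contributes a
    // negative displacement.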
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
                                                        getTargetMachine())))
      return;

    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class.  Do they matter any more here than they do
    // in the normal allocation?
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
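      // For example (hypothetical user code):
      //   char c;
      //   asm("setc %0" : "=q"(c));
      // In 32-bit mode 'q' must pick a byte-addressable register (a, b, c,
      // or d), while in 64-bit mode every GR8 register qualifies.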
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, &X86::FR64RegClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        return std::make_pair(0U, &X86::VR256RegClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
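  // For example (hypothetical user code):
  //   unsigned long long v;
  //   asm("rdtsc" : "=A"(v));
  // "A" is not a single TableGen register class (it means the EDX:EAX pair),
  // so the generic lookup fails and it is mapped by hand below.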
  if (Res.second == 0) {
    // Map {st(0)}, ..., {st(7)} onto ST0, ..., ST7.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::ST0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = &X86::GR32_ADRegClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it
  // to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == &X86::GR16RegClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR8RegClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR32RegClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR64RegClass;
      }
    }
  } else if (Res.second == &X86::FR32RegClass ||
             Res.second == &X86::FR64RegClass ||
             Res.second == &X86::VR128RegClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it
    // can find, ignoring the required type.
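    //
    // For example (illustrative): "{xmm0}" with an f64 operand may first
    // resolve to xmm0 in a class of the wrong width (e.g. FR32 or VR128);
    // the code below keeps Res.first == xmm0 but retargets the class so the
    // register copy uses the width VT actually needs.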

    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (X86::VR128RegClass.hasType(VT))
      Res.second = &X86::VR128RegClass;
    else if (X86::VR256RegClass.hasType(VT))
      Res.second = &X86::VR256RegClass;
  }

  return Res;
}