X86ISelLowering.cpp revision b1df5b013a38ab7381630af8b3142c56f604d85b
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file defines the interfaces that X86 uses to lower LLVM code into a 11// selection DAG. 12// 13//===----------------------------------------------------------------------===// 14 15#define DEBUG_TYPE "x86-isel" 16#include "X86ISelLowering.h" 17#include "Utils/X86ShuffleDecode.h" 18#include "X86.h" 19#include "X86CallingConv.h" 20#include "X86InstrBuilder.h" 21#include "X86TargetMachine.h" 22#include "X86TargetObjectFile.h" 23#include "llvm/ADT/SmallSet.h" 24#include "llvm/ADT/Statistic.h" 25#include "llvm/ADT/StringExtras.h" 26#include "llvm/ADT/VariadicFunction.h" 27#include "llvm/CodeGen/IntrinsicLowering.h" 28#include "llvm/CodeGen/MachineFrameInfo.h" 29#include "llvm/CodeGen/MachineFunction.h" 30#include "llvm/CodeGen/MachineInstrBuilder.h" 31#include "llvm/CodeGen/MachineJumpTableInfo.h" 32#include "llvm/CodeGen/MachineModuleInfo.h" 33#include "llvm/CodeGen/MachineRegisterInfo.h" 34#include "llvm/IR/CallingConv.h" 35#include "llvm/IR/Constants.h" 36#include "llvm/IR/DerivedTypes.h" 37#include "llvm/IR/Function.h" 38#include "llvm/IR/GlobalAlias.h" 39#include "llvm/IR/GlobalVariable.h" 40#include "llvm/IR/Instructions.h" 41#include "llvm/IR/Intrinsics.h" 42#include "llvm/IR/LLVMContext.h" 43#include "llvm/MC/MCAsmInfo.h" 44#include "llvm/MC/MCContext.h" 45#include "llvm/MC/MCExpr.h" 46#include "llvm/MC/MCSymbol.h" 47#include "llvm/Support/CallSite.h" 48#include "llvm/Support/Debug.h" 49#include "llvm/Support/ErrorHandling.h" 50#include "llvm/Support/MathExtras.h" 51#include "llvm/Target/TargetOptions.h" 52#include <bitset> 53#include <cctype> 54using namespace llvm; 55 56STATISTIC(NumTailCalls, "Number of tail calls"); 57 
58// Forward declarations. 59static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, 60 SDValue V2); 61 62static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, 63 SelectionDAG &DAG, SDLoc dl, 64 unsigned vectorWidth) { 65 assert((vectorWidth == 128 || vectorWidth == 256) && 66 "Unsupported vector width"); 67 EVT VT = Vec.getValueType(); 68 EVT ElVT = VT.getVectorElementType(); 69 unsigned Factor = VT.getSizeInBits()/vectorWidth; 70 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, 71 VT.getVectorNumElements()/Factor); 72 73 // Extract from UNDEF is UNDEF. 74 if (Vec.getOpcode() == ISD::UNDEF) 75 return DAG.getUNDEF(ResultVT); 76 77 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR 78 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); 79 80 // This is the index of the first element of the vectorWidth-bit chunk 81 // we want. 82 unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth) 83 * ElemsPerChunk); 84 85 // If the input is a buildvector just emit a smaller one. 86 if (Vec.getOpcode() == ISD::BUILD_VECTOR) 87 return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, 88 Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk); 89 90 SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); 91 SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, 92 VecIdx); 93 94 return Result; 95 96} 97/// Generate a DAG to grab 128-bits from a vector > 128 bits. This 98/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 99/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 100/// instructions or a simple subregister reference. Idx is an index in the 101/// 128 bits we want. It need not be aligned to a 128-bit bounday. That makes 102/// lowering EXTRACT_VECTOR_ELT operations easier. 
103static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, 104 SelectionDAG &DAG, SDLoc dl) { 105 assert((Vec.getValueType().is256BitVector() || 106 Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); 107 return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); 108} 109 110/// Generate a DAG to grab 256-bits from a 512-bit vector. 111static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, 112 SelectionDAG &DAG, SDLoc dl) { 113 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); 114 return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); 115} 116 117static SDValue InsertSubVector(SDValue Result, SDValue Vec, 118 unsigned IdxVal, SelectionDAG &DAG, 119 SDLoc dl, unsigned vectorWidth) { 120 assert((vectorWidth == 128 || vectorWidth == 256) && 121 "Unsupported vector width"); 122 // Inserting UNDEF is Result 123 if (Vec.getOpcode() == ISD::UNDEF) 124 return Result; 125 EVT VT = Vec.getValueType(); 126 EVT ElVT = VT.getVectorElementType(); 127 EVT ResultVT = Result.getValueType(); 128 129 // Insert the relevant vectorWidth bits. 130 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); 131 132 // This is the index of the first element of the vectorWidth-bit chunk 133 // we want. 134 unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth) 135 * ElemsPerChunk); 136 137 SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); 138 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, 139 VecIdx); 140} 141/// Generate a DAG to put 128-bits into a vector > 128 bits. This 142/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or 143/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a 144/// simple superregister reference. Idx is an index in the 128 bits 145/// we want. It need not be aligned to a 128-bit bounday. That makes 146/// lowering INSERT_VECTOR_ELT operations easier. 
147static SDValue Insert128BitVector(SDValue Result, SDValue Vec, 148 unsigned IdxVal, SelectionDAG &DAG, 149 SDLoc dl) { 150 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); 151 return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); 152} 153 154static SDValue Insert256BitVector(SDValue Result, SDValue Vec, 155 unsigned IdxVal, SelectionDAG &DAG, 156 SDLoc dl) { 157 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); 158 return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); 159} 160 161/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 162/// instructions. This is used because creating CONCAT_VECTOR nodes of 163/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower 164/// large BUILD_VECTORS. 165static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, 166 unsigned NumElems, SelectionDAG &DAG, 167 SDLoc dl) { 168 SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); 169 return Insert128BitVector(V, V2, NumElems/2, DAG, dl); 170} 171 172static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, 173 unsigned NumElems, SelectionDAG &DAG, 174 SDLoc dl) { 175 SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); 176 return Insert256BitVector(V, V2, NumElems/2, DAG, dl); 177} 178 179static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { 180 const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); 181 bool is64Bit = Subtarget->is64Bit(); 182 183 if (Subtarget->isTargetEnvMacho()) { 184 if (is64Bit) 185 return new X86_64MachoTargetObjectFile(); 186 return new TargetLoweringObjectFileMachO(); 187 } 188 189 if (Subtarget->isTargetLinux()) 190 return new X86LinuxTargetObjectFile(); 191 if (Subtarget->isTargetELF()) 192 return new TargetLoweringObjectFileELF(); 193 if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) 194 return new TargetLoweringObjectFileCOFF(); 195 llvm_unreachable("unknown subtarget 
type"); 196} 197 198X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) 199 : TargetLowering(TM, createTLOF(TM)) { 200 Subtarget = &TM.getSubtarget<X86Subtarget>(); 201 X86ScalarSSEf64 = Subtarget->hasSSE2(); 202 X86ScalarSSEf32 = Subtarget->hasSSE1(); 203 TD = getDataLayout(); 204 205 resetOperationActions(); 206} 207 208void X86TargetLowering::resetOperationActions() { 209 const TargetMachine &TM = getTargetMachine(); 210 static bool FirstTimeThrough = true; 211 212 // If none of the target options have changed, then we don't need to reset the 213 // operation actions. 214 if (!FirstTimeThrough && TO == TM.Options) return; 215 216 if (!FirstTimeThrough) { 217 // Reinitialize the actions. 218 initActions(); 219 FirstTimeThrough = false; 220 } 221 222 TO = TM.Options; 223 224 // Set up the TargetLowering object. 225 static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; 226 227 // X86 is weird, it always uses i8 for shift amounts and setcc results. 228 setBooleanContents(ZeroOrOneBooleanContent); 229 // X86-SSE is even stranger. It uses -1 or 0 for vector masks. 230 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 231 232 // For 64-bit since we have so many registers use the ILP scheduler, for 233 // 32-bit code use the register pressure specific scheduling. 234 // For Atom, always use ILP scheduling. 
235 if (Subtarget->isAtom()) 236 setSchedulingPreference(Sched::ILP); 237 else if (Subtarget->is64Bit()) 238 setSchedulingPreference(Sched::ILP); 239 else 240 setSchedulingPreference(Sched::RegPressure); 241 const X86RegisterInfo *RegInfo = 242 static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); 243 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); 244 245 // Bypass expensive divides on Atom when compiling with O2 246 if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) { 247 addBypassSlowDiv(32, 8); 248 if (Subtarget->is64Bit()) 249 addBypassSlowDiv(64, 16); 250 } 251 252 if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { 253 // Setup Windows compiler runtime calls. 254 setLibcallName(RTLIB::SDIV_I64, "_alldiv"); 255 setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); 256 setLibcallName(RTLIB::SREM_I64, "_allrem"); 257 setLibcallName(RTLIB::UREM_I64, "_aullrem"); 258 setLibcallName(RTLIB::MUL_I64, "_allmul"); 259 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); 260 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); 261 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); 262 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); 263 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); 264 265 // The _ftol2 runtime function has an unusual calling conv, which 266 // is modeled by a special pseudo-instruction. 267 setLibcallName(RTLIB::FPTOUINT_F64_I64, 0); 268 setLibcallName(RTLIB::FPTOUINT_F32_I64, 0); 269 setLibcallName(RTLIB::FPTOUINT_F64_I32, 0); 270 setLibcallName(RTLIB::FPTOUINT_F32_I32, 0); 271 } 272 273 if (Subtarget->isTargetDarwin()) { 274 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. 275 setUseUnderscoreSetJmp(false); 276 setUseUnderscoreLongJmp(false); 277 } else if (Subtarget->isTargetMingw()) { 278 // MS runtime is weird: it exports _setjmp, but longjmp! 
279 setUseUnderscoreSetJmp(true); 280 setUseUnderscoreLongJmp(false); 281 } else { 282 setUseUnderscoreSetJmp(true); 283 setUseUnderscoreLongJmp(true); 284 } 285 286 // Set up the register classes. 287 addRegisterClass(MVT::i8, &X86::GR8RegClass); 288 addRegisterClass(MVT::i16, &X86::GR16RegClass); 289 addRegisterClass(MVT::i32, &X86::GR32RegClass); 290 if (Subtarget->is64Bit()) 291 addRegisterClass(MVT::i64, &X86::GR64RegClass); 292 293 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 294 295 // We don't accept any truncstore of integer registers. 296 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 297 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 298 setTruncStoreAction(MVT::i64, MVT::i8 , Expand); 299 setTruncStoreAction(MVT::i32, MVT::i16, Expand); 300 setTruncStoreAction(MVT::i32, MVT::i8 , Expand); 301 setTruncStoreAction(MVT::i16, MVT::i8, Expand); 302 303 // SETOEQ and SETUNE require checking two conditions. 304 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); 305 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); 306 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); 307 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); 308 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); 309 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); 310 311 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this 312 // operation. 313 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); 314 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); 315 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); 316 317 if (Subtarget->is64Bit()) { 318 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); 319 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 320 } else if (!TM.Options.UseSoftFloat) { 321 // We have an algorithm for SSE2->double, and we turn this into a 322 // 64-bit FILD followed by conditional FADD for other targets. 
323 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 324 // We have an algorithm for SSE2, and we turn this into a 64-bit 325 // FILD for other targets. 326 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); 327 } 328 329 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have 330 // this operation. 331 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); 332 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); 333 334 if (!TM.Options.UseSoftFloat) { 335 // SSE has no i16 to fp conversion, only i32 336 if (X86ScalarSSEf32) { 337 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 338 // f32 and f64 cases are Legal, f80 case is not 339 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 340 } else { 341 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); 342 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 343 } 344 } else { 345 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 346 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); 347 } 348 349 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 350 // are Legal, f80 is custom lowered. 351 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); 352 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); 353 354 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have 355 // this operation. 356 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); 357 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); 358 359 if (X86ScalarSSEf32) { 360 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); 361 // f32 and f64 cases are Legal, f80 case is not 362 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 363 } else { 364 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); 365 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 366 } 367 368 // Handle FP_TO_UINT by promoting the destination to a larger signed 369 // conversion. 
370 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); 371 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); 372 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); 373 374 if (Subtarget->is64Bit()) { 375 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); 376 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); 377 } else if (!TM.Options.UseSoftFloat) { 378 // Since AVX is a superset of SSE3, only check for SSE here. 379 if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) 380 // Expand FP_TO_UINT into a select. 381 // FIXME: We would like to use a Custom expander here eventually to do 382 // the optimal thing for SSE vs. the default expansion in the legalizer. 383 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); 384 else 385 // With SSE3 we can use fisttpll to convert to a signed i64; without 386 // SSE, we're stuck with a fistpll. 387 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); 388 } 389 390 if (isTargetFTOL()) { 391 // Use the _ftol2 runtime function, which has a pseudo-instruction 392 // to handle its weird calling convention. 393 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); 394 } 395 396 // TODO: when we have SSE, these could be more efficient, by using movd/movq. 397 if (!X86ScalarSSEf64) { 398 setOperationAction(ISD::BITCAST , MVT::f32 , Expand); 399 setOperationAction(ISD::BITCAST , MVT::i32 , Expand); 400 if (Subtarget->is64Bit()) { 401 setOperationAction(ISD::BITCAST , MVT::f64 , Expand); 402 // Without SSE, i64->f64 goes through memory. 403 setOperationAction(ISD::BITCAST , MVT::i64 , Expand); 404 } 405 } 406 407 // Scalar integer divide and remainder are lowered to use operations that 408 // produce two results, to match the available instructions. This exposes 409 // the two-result form to trivial CSE, which is able to combine x/y and x%y 410 // into a single instruction. 
411 // 412 // Scalar integer multiply-high is also lowered to use two-result 413 // operations, to match the available instructions. However, plain multiply 414 // (low) operations are left as Legal, as there are single-result 415 // instructions for this in x86. Using the two-result multiply instructions 416 // when both high and low results are needed must be arranged by dagcombine. 417 for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { 418 MVT VT = IntVTs[i]; 419 setOperationAction(ISD::MULHS, VT, Expand); 420 setOperationAction(ISD::MULHU, VT, Expand); 421 setOperationAction(ISD::SDIV, VT, Expand); 422 setOperationAction(ISD::UDIV, VT, Expand); 423 setOperationAction(ISD::SREM, VT, Expand); 424 setOperationAction(ISD::UREM, VT, Expand); 425 426 // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. 427 setOperationAction(ISD::ADDC, VT, Custom); 428 setOperationAction(ISD::ADDE, VT, Custom); 429 setOperationAction(ISD::SUBC, VT, Custom); 430 setOperationAction(ISD::SUBE, VT, Custom); 431 } 432 433 setOperationAction(ISD::BR_JT , MVT::Other, Expand); 434 setOperationAction(ISD::BRCOND , MVT::Other, Custom); 435 setOperationAction(ISD::BR_CC , MVT::f32, Expand); 436 setOperationAction(ISD::BR_CC , MVT::f64, Expand); 437 setOperationAction(ISD::BR_CC , MVT::f80, Expand); 438 setOperationAction(ISD::BR_CC , MVT::i8, Expand); 439 setOperationAction(ISD::BR_CC , MVT::i16, Expand); 440 setOperationAction(ISD::BR_CC , MVT::i32, Expand); 441 setOperationAction(ISD::BR_CC , MVT::i64, Expand); 442 setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); 443 if (Subtarget->is64Bit()) 444 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 445 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); 446 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 447 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); 448 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); 449 setOperationAction(ISD::FREM , MVT::f32 
, Expand); 450 setOperationAction(ISD::FREM , MVT::f64 , Expand); 451 setOperationAction(ISD::FREM , MVT::f80 , Expand); 452 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); 453 454 // Promote the i8 variants and force them on up to i32 which has a shorter 455 // encoding. 456 setOperationAction(ISD::CTTZ , MVT::i8 , Promote); 457 AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32); 458 setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote); 459 AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32); 460 if (Subtarget->hasBMI()) { 461 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand); 462 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand); 463 if (Subtarget->is64Bit()) 464 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); 465 } else { 466 setOperationAction(ISD::CTTZ , MVT::i16 , Custom); 467 setOperationAction(ISD::CTTZ , MVT::i32 , Custom); 468 if (Subtarget->is64Bit()) 469 setOperationAction(ISD::CTTZ , MVT::i64 , Custom); 470 } 471 472 if (Subtarget->hasLZCNT()) { 473 // When promoting the i8 variants, force them to i32 for a shorter 474 // encoding. 
475 setOperationAction(ISD::CTLZ , MVT::i8 , Promote); 476 AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32); 477 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote); 478 AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); 479 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand); 480 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand); 481 if (Subtarget->is64Bit()) 482 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); 483 } else { 484 setOperationAction(ISD::CTLZ , MVT::i8 , Custom); 485 setOperationAction(ISD::CTLZ , MVT::i16 , Custom); 486 setOperationAction(ISD::CTLZ , MVT::i32 , Custom); 487 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); 488 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); 489 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); 490 if (Subtarget->is64Bit()) { 491 setOperationAction(ISD::CTLZ , MVT::i64 , Custom); 492 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); 493 } 494 } 495 496 if (Subtarget->hasPOPCNT()) { 497 setOperationAction(ISD::CTPOP , MVT::i8 , Promote); 498 } else { 499 setOperationAction(ISD::CTPOP , MVT::i8 , Expand); 500 setOperationAction(ISD::CTPOP , MVT::i16 , Expand); 501 setOperationAction(ISD::CTPOP , MVT::i32 , Expand); 502 if (Subtarget->is64Bit()) 503 setOperationAction(ISD::CTPOP , MVT::i64 , Expand); 504 } 505 506 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); 507 setOperationAction(ISD::BSWAP , MVT::i16 , Expand); 508 509 // These should be promoted to a larger select which is supported. 510 setOperationAction(ISD::SELECT , MVT::i1 , Promote); 511 // X86 wants to expand cmov itself. 
512 setOperationAction(ISD::SELECT , MVT::i8 , Custom); 513 setOperationAction(ISD::SELECT , MVT::i16 , Custom); 514 setOperationAction(ISD::SELECT , MVT::i32 , Custom); 515 setOperationAction(ISD::SELECT , MVT::f32 , Custom); 516 setOperationAction(ISD::SELECT , MVT::f64 , Custom); 517 setOperationAction(ISD::SELECT , MVT::f80 , Custom); 518 setOperationAction(ISD::SETCC , MVT::i8 , Custom); 519 setOperationAction(ISD::SETCC , MVT::i16 , Custom); 520 setOperationAction(ISD::SETCC , MVT::i32 , Custom); 521 setOperationAction(ISD::SETCC , MVT::f32 , Custom); 522 setOperationAction(ISD::SETCC , MVT::f64 , Custom); 523 setOperationAction(ISD::SETCC , MVT::f80 , Custom); 524 if (Subtarget->is64Bit()) { 525 setOperationAction(ISD::SELECT , MVT::i64 , Custom); 526 setOperationAction(ISD::SETCC , MVT::i64 , Custom); 527 } 528 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); 529 // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support 530 // SjLj exception handling but a light-weight setjmp/longjmp replacement to 531 // support continuation, user-level threading, and etc.. As a result, no 532 // other SjLj exception interfaces are implemented and please don't build 533 // your own exception handling based on them. 534 // LLVM/Clang supports zero-cost DWARF exception handling. 535 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); 536 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); 537 538 // Darwin ABI issue. 
539 setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); 540 setOperationAction(ISD::JumpTable , MVT::i32 , Custom); 541 setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); 542 setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); 543 if (Subtarget->is64Bit()) 544 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 545 setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); 546 setOperationAction(ISD::BlockAddress , MVT::i32 , Custom); 547 if (Subtarget->is64Bit()) { 548 setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); 549 setOperationAction(ISD::JumpTable , MVT::i64 , Custom); 550 setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); 551 setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); 552 setOperationAction(ISD::BlockAddress , MVT::i64 , Custom); 553 } 554 // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) 555 setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); 556 setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); 557 setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); 558 if (Subtarget->is64Bit()) { 559 setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); 560 setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); 561 setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); 562 } 563 564 if (Subtarget->hasSSE1()) 565 setOperationAction(ISD::PREFETCH , MVT::Other, Legal); 566 567 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); 568 569 // Expand certain atomics 570 for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { 571 MVT VT = IntVTs[i]; 572 setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); 573 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); 574 setOperationAction(ISD::ATOMIC_STORE, VT, Custom); 575 } 576 577 if (!Subtarget->is64Bit()) { 578 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); 579 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); 580 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 581 
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 582 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); 583 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); 584 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); 585 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); 586 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom); 587 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom); 588 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom); 589 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom); 590 } 591 592 if (Subtarget->hasCmpxchg16b()) { 593 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); 594 } 595 596 // FIXME - use subtarget debug flags 597 if (!Subtarget->isTargetDarwin() && 598 !Subtarget->isTargetELF() && 599 !Subtarget->isTargetCygMing()) { 600 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); 601 } 602 603 if (Subtarget->is64Bit()) { 604 setExceptionPointerRegister(X86::RAX); 605 setExceptionSelectorRegister(X86::RDX); 606 } else { 607 setExceptionPointerRegister(X86::EAX); 608 setExceptionSelectorRegister(X86::EDX); 609 } 610 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); 611 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); 612 613 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); 614 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); 615 616 setOperationAction(ISD::TRAP, MVT::Other, Legal); 617 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); 618 619 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 620 setOperationAction(ISD::VASTART , MVT::Other, Custom); 621 setOperationAction(ISD::VAEND , MVT::Other, Expand); 622 if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) { 623 // TargetInfo::X86_64ABIBuiltinVaList 624 setOperationAction(ISD::VAARG , MVT::Other, Custom); 625 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 626 } else { 627 // TargetInfo::CharPtrBuiltinVaList 628 
setOperationAction(ISD::VAARG , MVT::Other, Expand); 629 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 630 } 631 632 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 633 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 634 635 if (Subtarget->isOSWindows() && !Subtarget->isTargetEnvMacho()) 636 setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? 637 MVT::i64 : MVT::i32, Custom); 638 else if (TM.Options.EnableSegmentedStacks) 639 setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? 640 MVT::i64 : MVT::i32, Custom); 641 else 642 setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? 643 MVT::i64 : MVT::i32, Expand); 644 645 if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { 646 // f32 and f64 use SSE. 647 // Set up the FP register classes. 648 addRegisterClass(MVT::f32, &X86::FR32RegClass); 649 addRegisterClass(MVT::f64, &X86::FR64RegClass); 650 651 // Use ANDPD to simulate FABS. 652 setOperationAction(ISD::FABS , MVT::f64, Custom); 653 setOperationAction(ISD::FABS , MVT::f32, Custom); 654 655 // Use XORP to simulate FNEG. 656 setOperationAction(ISD::FNEG , MVT::f64, Custom); 657 setOperationAction(ISD::FNEG , MVT::f32, Custom); 658 659 // Use ANDPD and ORPD to simulate FCOPYSIGN. 660 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 661 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 662 663 // Lower this to FGETSIGNx86 plus an AND. 
664 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); 665 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); 666 667 // We don't support sin/cos/fmod 668 setOperationAction(ISD::FSIN , MVT::f64, Expand); 669 setOperationAction(ISD::FCOS , MVT::f64, Expand); 670 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 671 setOperationAction(ISD::FSIN , MVT::f32, Expand); 672 setOperationAction(ISD::FCOS , MVT::f32, Expand); 673 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 674 675 // Expand FP immediates into loads from the stack, except for the special 676 // cases we handle. 677 addLegalFPImmediate(APFloat(+0.0)); // xorpd 678 addLegalFPImmediate(APFloat(+0.0f)); // xorps 679 } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) { 680 // Use SSE for f32, x87 for f64. 681 // Set up the FP register classes. 682 addRegisterClass(MVT::f32, &X86::FR32RegClass); 683 addRegisterClass(MVT::f64, &X86::RFP64RegClass); 684 685 // Use ANDPS to simulate FABS. 686 setOperationAction(ISD::FABS , MVT::f32, Custom); 687 688 // Use XORP to simulate FNEG. 689 setOperationAction(ISD::FNEG , MVT::f32, Custom); 690 691 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 692 693 // Use ANDPS and ORPS to simulate FCOPYSIGN. 694 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 695 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 696 697 // We don't support sin/cos/fmod 698 setOperationAction(ISD::FSIN , MVT::f32, Expand); 699 setOperationAction(ISD::FCOS , MVT::f32, Expand); 700 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 701 702 // Special cases we handle for FP constants. 
703 addLegalFPImmediate(APFloat(+0.0f)); // xorps 704 addLegalFPImmediate(APFloat(+0.0)); // FLD0 705 addLegalFPImmediate(APFloat(+1.0)); // FLD1 706 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 707 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 708 709 if (!TM.Options.UnsafeFPMath) { 710 setOperationAction(ISD::FSIN , MVT::f64, Expand); 711 setOperationAction(ISD::FCOS , MVT::f64, Expand); 712 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 713 } 714 } else if (!TM.Options.UseSoftFloat) { 715 // f32 and f64 in x87. 716 // Set up the FP register classes. 717 addRegisterClass(MVT::f64, &X86::RFP64RegClass); 718 addRegisterClass(MVT::f32, &X86::RFP32RegClass); 719 720 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 721 setOperationAction(ISD::UNDEF, MVT::f32, Expand); 722 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 723 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 724 725 if (!TM.Options.UnsafeFPMath) { 726 setOperationAction(ISD::FSIN , MVT::f64, Expand); 727 setOperationAction(ISD::FSIN , MVT::f32, Expand); 728 setOperationAction(ISD::FCOS , MVT::f64, Expand); 729 setOperationAction(ISD::FCOS , MVT::f32, Expand); 730 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 731 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 732 } 733 addLegalFPImmediate(APFloat(+0.0)); // FLD0 734 addLegalFPImmediate(APFloat(+1.0)); // FLD1 735 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 736 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 737 addLegalFPImmediate(APFloat(+0.0f)); // FLD0 738 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 739 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS 740 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS 741 } 742 743 // We don't support FMA. 744 setOperationAction(ISD::FMA, MVT::f64, Expand); 745 setOperationAction(ISD::FMA, MVT::f32, Expand); 746 747 // Long double always uses X87. 
748 if (!TM.Options.UseSoftFloat) { 749 addRegisterClass(MVT::f80, &X86::RFP80RegClass); 750 setOperationAction(ISD::UNDEF, MVT::f80, Expand); 751 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); 752 { 753 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); 754 addLegalFPImmediate(TmpFlt); // FLD0 755 TmpFlt.changeSign(); 756 addLegalFPImmediate(TmpFlt); // FLD0/FCHS 757 758 bool ignored; 759 APFloat TmpFlt2(+1.0); 760 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 761 &ignored); 762 addLegalFPImmediate(TmpFlt2); // FLD1 763 TmpFlt2.changeSign(); 764 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS 765 } 766 767 if (!TM.Options.UnsafeFPMath) { 768 setOperationAction(ISD::FSIN , MVT::f80, Expand); 769 setOperationAction(ISD::FCOS , MVT::f80, Expand); 770 setOperationAction(ISD::FSINCOS, MVT::f80, Expand); 771 } 772 773 setOperationAction(ISD::FFLOOR, MVT::f80, Expand); 774 setOperationAction(ISD::FCEIL, MVT::f80, Expand); 775 setOperationAction(ISD::FTRUNC, MVT::f80, Expand); 776 setOperationAction(ISD::FRINT, MVT::f80, Expand); 777 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); 778 setOperationAction(ISD::FMA, MVT::f80, Expand); 779 } 780 781 // Always use a library call for pow. 782 setOperationAction(ISD::FPOW , MVT::f32 , Expand); 783 setOperationAction(ISD::FPOW , MVT::f64 , Expand); 784 setOperationAction(ISD::FPOW , MVT::f80 , Expand); 785 786 setOperationAction(ISD::FLOG, MVT::f80, Expand); 787 setOperationAction(ISD::FLOG2, MVT::f80, Expand); 788 setOperationAction(ISD::FLOG10, MVT::f80, Expand); 789 setOperationAction(ISD::FEXP, MVT::f80, Expand); 790 setOperationAction(ISD::FEXP2, MVT::f80, Expand); 791 792 // First set operation action for all vector types to either promote 793 // (for widening) or expand (for scalarization). Then we will selectively 794 // turn on ones that can be effectively codegen'd. 
795 for (int i = MVT::FIRST_VECTOR_VALUETYPE; 796 i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { 797 MVT VT = (MVT::SimpleValueType)i; 798 setOperationAction(ISD::ADD , VT, Expand); 799 setOperationAction(ISD::SUB , VT, Expand); 800 setOperationAction(ISD::FADD, VT, Expand); 801 setOperationAction(ISD::FNEG, VT, Expand); 802 setOperationAction(ISD::FSUB, VT, Expand); 803 setOperationAction(ISD::MUL , VT, Expand); 804 setOperationAction(ISD::FMUL, VT, Expand); 805 setOperationAction(ISD::SDIV, VT, Expand); 806 setOperationAction(ISD::UDIV, VT, Expand); 807 setOperationAction(ISD::FDIV, VT, Expand); 808 setOperationAction(ISD::SREM, VT, Expand); 809 setOperationAction(ISD::UREM, VT, Expand); 810 setOperationAction(ISD::LOAD, VT, Expand); 811 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); 812 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); 813 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); 814 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); 815 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); 816 setOperationAction(ISD::FABS, VT, Expand); 817 setOperationAction(ISD::FSIN, VT, Expand); 818 setOperationAction(ISD::FSINCOS, VT, Expand); 819 setOperationAction(ISD::FCOS, VT, Expand); 820 setOperationAction(ISD::FSINCOS, VT, Expand); 821 setOperationAction(ISD::FREM, VT, Expand); 822 setOperationAction(ISD::FMA, VT, Expand); 823 setOperationAction(ISD::FPOWI, VT, Expand); 824 setOperationAction(ISD::FSQRT, VT, Expand); 825 setOperationAction(ISD::FCOPYSIGN, VT, Expand); 826 setOperationAction(ISD::FFLOOR, VT, Expand); 827 setOperationAction(ISD::FCEIL, VT, Expand); 828 setOperationAction(ISD::FTRUNC, VT, Expand); 829 setOperationAction(ISD::FRINT, VT, Expand); 830 setOperationAction(ISD::FNEARBYINT, VT, Expand); 831 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 832 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 833 setOperationAction(ISD::SDIVREM, VT, Expand); 834 setOperationAction(ISD::UDIVREM, VT, Expand); 835 setOperationAction(ISD::FPOW, 
VT, Expand); 836 setOperationAction(ISD::CTPOP, VT, Expand); 837 setOperationAction(ISD::CTTZ, VT, Expand); 838 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); 839 setOperationAction(ISD::CTLZ, VT, Expand); 840 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); 841 setOperationAction(ISD::SHL, VT, Expand); 842 setOperationAction(ISD::SRA, VT, Expand); 843 setOperationAction(ISD::SRL, VT, Expand); 844 setOperationAction(ISD::ROTL, VT, Expand); 845 setOperationAction(ISD::ROTR, VT, Expand); 846 setOperationAction(ISD::BSWAP, VT, Expand); 847 setOperationAction(ISD::SETCC, VT, Expand); 848 setOperationAction(ISD::FLOG, VT, Expand); 849 setOperationAction(ISD::FLOG2, VT, Expand); 850 setOperationAction(ISD::FLOG10, VT, Expand); 851 setOperationAction(ISD::FEXP, VT, Expand); 852 setOperationAction(ISD::FEXP2, VT, Expand); 853 setOperationAction(ISD::FP_TO_UINT, VT, Expand); 854 setOperationAction(ISD::FP_TO_SINT, VT, Expand); 855 setOperationAction(ISD::UINT_TO_FP, VT, Expand); 856 setOperationAction(ISD::SINT_TO_FP, VT, Expand); 857 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); 858 setOperationAction(ISD::TRUNCATE, VT, Expand); 859 setOperationAction(ISD::SIGN_EXTEND, VT, Expand); 860 setOperationAction(ISD::ZERO_EXTEND, VT, Expand); 861 setOperationAction(ISD::ANY_EXTEND, VT, Expand); 862 setOperationAction(ISD::VSELECT, VT, Expand); 863 for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE; 864 InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) 865 setTruncStoreAction(VT, 866 (MVT::SimpleValueType)InnerVT, Expand); 867 setLoadExtAction(ISD::SEXTLOAD, VT, Expand); 868 setLoadExtAction(ISD::ZEXTLOAD, VT, Expand); 869 setLoadExtAction(ISD::EXTLOAD, VT, Expand); 870 } 871 872 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 873 // with -msoft-float, disable use of MMX as well. 
874 if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) { 875 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); 876 // No operations on x86mmx supported, everything uses intrinsics. 877 } 878 879 // MMX-sized vectors (other than x86mmx) are expected to be expanded 880 // into smaller operations. 881 setOperationAction(ISD::MULHS, MVT::v8i8, Expand); 882 setOperationAction(ISD::MULHS, MVT::v4i16, Expand); 883 setOperationAction(ISD::MULHS, MVT::v2i32, Expand); 884 setOperationAction(ISD::MULHS, MVT::v1i64, Expand); 885 setOperationAction(ISD::AND, MVT::v8i8, Expand); 886 setOperationAction(ISD::AND, MVT::v4i16, Expand); 887 setOperationAction(ISD::AND, MVT::v2i32, Expand); 888 setOperationAction(ISD::AND, MVT::v1i64, Expand); 889 setOperationAction(ISD::OR, MVT::v8i8, Expand); 890 setOperationAction(ISD::OR, MVT::v4i16, Expand); 891 setOperationAction(ISD::OR, MVT::v2i32, Expand); 892 setOperationAction(ISD::OR, MVT::v1i64, Expand); 893 setOperationAction(ISD::XOR, MVT::v8i8, Expand); 894 setOperationAction(ISD::XOR, MVT::v4i16, Expand); 895 setOperationAction(ISD::XOR, MVT::v2i32, Expand); 896 setOperationAction(ISD::XOR, MVT::v1i64, Expand); 897 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand); 898 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand); 899 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand); 900 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand); 901 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); 902 setOperationAction(ISD::SELECT, MVT::v8i8, Expand); 903 setOperationAction(ISD::SELECT, MVT::v4i16, Expand); 904 setOperationAction(ISD::SELECT, MVT::v2i32, Expand); 905 setOperationAction(ISD::SELECT, MVT::v1i64, Expand); 906 setOperationAction(ISD::BITCAST, MVT::v8i8, Expand); 907 setOperationAction(ISD::BITCAST, MVT::v4i16, Expand); 908 setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); 909 setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); 910 911 if (!TM.Options.UseSoftFloat && 
Subtarget->hasSSE1()) { 912 addRegisterClass(MVT::v4f32, &X86::VR128RegClass); 913 914 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 915 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 916 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 917 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 918 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 919 setOperationAction(ISD::FNEG, MVT::v4f32, Custom); 920 setOperationAction(ISD::FABS, MVT::v4f32, Custom); 921 setOperationAction(ISD::LOAD, MVT::v4f32, Legal); 922 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 923 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 924 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 925 setOperationAction(ISD::SELECT, MVT::v4f32, Custom); 926 } 927 928 if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) { 929 addRegisterClass(MVT::v2f64, &X86::VR128RegClass); 930 931 // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM 932 // registers cannot be used even for integer operations. 
933 addRegisterClass(MVT::v16i8, &X86::VR128RegClass); 934 addRegisterClass(MVT::v8i16, &X86::VR128RegClass); 935 addRegisterClass(MVT::v4i32, &X86::VR128RegClass); 936 addRegisterClass(MVT::v2i64, &X86::VR128RegClass); 937 938 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 939 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 940 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 941 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 942 setOperationAction(ISD::MUL, MVT::v4i32, Custom); 943 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 944 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 945 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 946 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 947 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 948 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 949 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 950 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 951 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 952 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 953 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 954 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 955 setOperationAction(ISD::FABS, MVT::v2f64, Custom); 956 957 setOperationAction(ISD::SETCC, MVT::v2i64, Custom); 958 setOperationAction(ISD::SETCC, MVT::v16i8, Custom); 959 setOperationAction(ISD::SETCC, MVT::v8i16, Custom); 960 setOperationAction(ISD::SETCC, MVT::v4i32, Custom); 961 962 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 963 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 964 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 965 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 966 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 967 968 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 
969 for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { 970 MVT VT = (MVT::SimpleValueType)i; 971 // Do not attempt to custom lower non-power-of-2 vectors 972 if (!isPowerOf2_32(VT.getVectorNumElements())) 973 continue; 974 // Do not attempt to custom lower non-128-bit vectors 975 if (!VT.is128BitVector()) 976 continue; 977 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 978 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 979 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 980 } 981 982 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 983 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 984 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 985 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 986 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 987 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 988 989 if (Subtarget->is64Bit()) { 990 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 991 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 992 } 993 994 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 995 for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { 996 MVT VT = (MVT::SimpleValueType)i; 997 998 // Do not attempt to promote non-128-bit vectors 999 if (!VT.is128BitVector()) 1000 continue; 1001 1002 setOperationAction(ISD::AND, VT, Promote); 1003 AddPromotedToType (ISD::AND, VT, MVT::v2i64); 1004 setOperationAction(ISD::OR, VT, Promote); 1005 AddPromotedToType (ISD::OR, VT, MVT::v2i64); 1006 setOperationAction(ISD::XOR, VT, Promote); 1007 AddPromotedToType (ISD::XOR, VT, MVT::v2i64); 1008 setOperationAction(ISD::LOAD, VT, Promote); 1009 AddPromotedToType (ISD::LOAD, VT, MVT::v2i64); 1010 setOperationAction(ISD::SELECT, VT, Promote); 1011 AddPromotedToType (ISD::SELECT, VT, MVT::v2i64); 1012 } 1013 1014 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 1015 1016 // Custom lower v2i64 and v2f64 selects. 
1017 setOperationAction(ISD::LOAD, MVT::v2f64, Legal); 1018 setOperationAction(ISD::LOAD, MVT::v2i64, Legal); 1019 setOperationAction(ISD::SELECT, MVT::v2f64, Custom); 1020 setOperationAction(ISD::SELECT, MVT::v2i64, Custom); 1021 1022 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); 1023 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); 1024 1025 setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); 1026 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); 1027 // As there is no 64-bit GPR available, we need build a special custom 1028 // sequence to convert from v2i32 to v2f32. 1029 if (!Subtarget->is64Bit()) 1030 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); 1031 1032 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); 1033 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); 1034 1035 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal); 1036 } 1037 1038 if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) { 1039 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 1040 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 1041 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 1042 setOperationAction(ISD::FRINT, MVT::f32, Legal); 1043 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 1044 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 1045 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 1046 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 1047 setOperationAction(ISD::FRINT, MVT::f64, Legal); 1048 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 1049 1050 setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); 1051 setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); 1052 setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); 1053 setOperationAction(ISD::FRINT, MVT::v4f32, Legal); 1054 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); 1055 setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); 1056 setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); 1057 setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); 1058 
setOperationAction(ISD::FRINT, MVT::v2f64, Legal); 1059 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); 1060 1061 // FIXME: Do we need to handle scalar-to-vector here? 1062 setOperationAction(ISD::MUL, MVT::v4i32, Legal); 1063 1064 setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); 1065 setOperationAction(ISD::VSELECT, MVT::v2i64, Legal); 1066 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); 1067 setOperationAction(ISD::VSELECT, MVT::v4i32, Legal); 1068 setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); 1069 1070 // i8 and i16 vectors are custom , because the source register and source 1071 // source memory operand types are not the same width. f32 vectors are 1072 // custom since the immediate controlling the insert encodes additional 1073 // information. 1074 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 1075 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 1076 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 1077 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 1078 1079 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); 1080 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); 1081 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 1082 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 1083 1084 // FIXME: these should be Legal but thats only for the case where 1085 // the index is constant. For now custom expand to deal with that. 
1086 if (Subtarget->is64Bit()) { 1087 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 1088 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 1089 } 1090 } 1091 1092 if (Subtarget->hasSSE2()) { 1093 setOperationAction(ISD::SRL, MVT::v8i16, Custom); 1094 setOperationAction(ISD::SRL, MVT::v16i8, Custom); 1095 1096 setOperationAction(ISD::SHL, MVT::v8i16, Custom); 1097 setOperationAction(ISD::SHL, MVT::v16i8, Custom); 1098 1099 setOperationAction(ISD::SRA, MVT::v8i16, Custom); 1100 setOperationAction(ISD::SRA, MVT::v16i8, Custom); 1101 1102 // In the customized shift lowering, the legal cases in AVX2 will be 1103 // recognized. 1104 setOperationAction(ISD::SRL, MVT::v2i64, Custom); 1105 setOperationAction(ISD::SRL, MVT::v4i32, Custom); 1106 1107 setOperationAction(ISD::SHL, MVT::v2i64, Custom); 1108 setOperationAction(ISD::SHL, MVT::v4i32, Custom); 1109 1110 setOperationAction(ISD::SRA, MVT::v4i32, Custom); 1111 1112 setOperationAction(ISD::SDIV, MVT::v8i16, Custom); 1113 setOperationAction(ISD::SDIV, MVT::v4i32, Custom); 1114 } 1115 1116 if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) { 1117 addRegisterClass(MVT::v32i8, &X86::VR256RegClass); 1118 addRegisterClass(MVT::v16i16, &X86::VR256RegClass); 1119 addRegisterClass(MVT::v8i32, &X86::VR256RegClass); 1120 addRegisterClass(MVT::v8f32, &X86::VR256RegClass); 1121 addRegisterClass(MVT::v4i64, &X86::VR256RegClass); 1122 addRegisterClass(MVT::v4f64, &X86::VR256RegClass); 1123 1124 setOperationAction(ISD::LOAD, MVT::v8f32, Legal); 1125 setOperationAction(ISD::LOAD, MVT::v4f64, Legal); 1126 setOperationAction(ISD::LOAD, MVT::v4i64, Legal); 1127 1128 setOperationAction(ISD::FADD, MVT::v8f32, Legal); 1129 setOperationAction(ISD::FSUB, MVT::v8f32, Legal); 1130 setOperationAction(ISD::FMUL, MVT::v8f32, Legal); 1131 setOperationAction(ISD::FDIV, MVT::v8f32, Legal); 1132 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); 1133 setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal); 1134 
setOperationAction(ISD::FCEIL, MVT::v8f32, Legal); 1135 setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal); 1136 setOperationAction(ISD::FRINT, MVT::v8f32, Legal); 1137 setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal); 1138 setOperationAction(ISD::FNEG, MVT::v8f32, Custom); 1139 setOperationAction(ISD::FABS, MVT::v8f32, Custom); 1140 1141 setOperationAction(ISD::FADD, MVT::v4f64, Legal); 1142 setOperationAction(ISD::FSUB, MVT::v4f64, Legal); 1143 setOperationAction(ISD::FMUL, MVT::v4f64, Legal); 1144 setOperationAction(ISD::FDIV, MVT::v4f64, Legal); 1145 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); 1146 setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); 1147 setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); 1148 setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); 1149 setOperationAction(ISD::FRINT, MVT::v4f64, Legal); 1150 setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal); 1151 setOperationAction(ISD::FNEG, MVT::v4f64, Custom); 1152 setOperationAction(ISD::FABS, MVT::v4f64, Custom); 1153 1154 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); 1155 1156 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); 1157 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); 1158 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); 1159 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); 1160 1161 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); 1162 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); 1163 1164 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal); 1165 1166 setOperationAction(ISD::SRL, MVT::v16i16, Custom); 1167 setOperationAction(ISD::SRL, MVT::v32i8, Custom); 1168 1169 setOperationAction(ISD::SHL, MVT::v16i16, Custom); 1170 setOperationAction(ISD::SHL, MVT::v32i8, Custom); 1171 1172 setOperationAction(ISD::SRA, MVT::v16i16, Custom); 1173 setOperationAction(ISD::SRA, MVT::v32i8, Custom); 1174 1175 setOperationAction(ISD::SDIV, MVT::v16i16, Custom); 1176 1177 setOperationAction(ISD::SETCC, MVT::v32i8, Custom); 1178 
setOperationAction(ISD::SETCC, MVT::v16i16, Custom); 1179 setOperationAction(ISD::SETCC, MVT::v8i32, Custom); 1180 setOperationAction(ISD::SETCC, MVT::v4i64, Custom); 1181 1182 setOperationAction(ISD::SELECT, MVT::v4f64, Custom); 1183 setOperationAction(ISD::SELECT, MVT::v4i64, Custom); 1184 setOperationAction(ISD::SELECT, MVT::v8f32, Custom); 1185 1186 setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); 1187 setOperationAction(ISD::VSELECT, MVT::v4i64, Legal); 1188 setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); 1189 setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); 1190 1191 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); 1192 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); 1193 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); 1194 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom); 1195 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); 1196 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom); 1197 setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom); 1198 setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom); 1199 setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom); 1200 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); 1201 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); 1202 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); 1203 1204 if (Subtarget->hasFMA() || Subtarget->hasFMA4()) { 1205 setOperationAction(ISD::FMA, MVT::v8f32, Legal); 1206 setOperationAction(ISD::FMA, MVT::v4f64, Legal); 1207 setOperationAction(ISD::FMA, MVT::v4f32, Legal); 1208 setOperationAction(ISD::FMA, MVT::v2f64, Legal); 1209 setOperationAction(ISD::FMA, MVT::f32, Legal); 1210 setOperationAction(ISD::FMA, MVT::f64, Legal); 1211 } 1212 1213 if (Subtarget->hasInt256()) { 1214 setOperationAction(ISD::ADD, MVT::v4i64, Legal); 1215 setOperationAction(ISD::ADD, MVT::v8i32, Legal); 1216 setOperationAction(ISD::ADD, MVT::v16i16, Legal); 1217 setOperationAction(ISD::ADD, MVT::v32i8, Legal); 1218 1219 
setOperationAction(ISD::SUB, MVT::v4i64, Legal); 1220 setOperationAction(ISD::SUB, MVT::v8i32, Legal); 1221 setOperationAction(ISD::SUB, MVT::v16i16, Legal); 1222 setOperationAction(ISD::SUB, MVT::v32i8, Legal); 1223 1224 setOperationAction(ISD::MUL, MVT::v4i64, Custom); 1225 setOperationAction(ISD::MUL, MVT::v8i32, Legal); 1226 setOperationAction(ISD::MUL, MVT::v16i16, Legal); 1227 // Don't lower v32i8 because there is no 128-bit byte mul 1228 1229 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); 1230 1231 setOperationAction(ISD::SDIV, MVT::v8i32, Custom); 1232 } else { 1233 setOperationAction(ISD::ADD, MVT::v4i64, Custom); 1234 setOperationAction(ISD::ADD, MVT::v8i32, Custom); 1235 setOperationAction(ISD::ADD, MVT::v16i16, Custom); 1236 setOperationAction(ISD::ADD, MVT::v32i8, Custom); 1237 1238 setOperationAction(ISD::SUB, MVT::v4i64, Custom); 1239 setOperationAction(ISD::SUB, MVT::v8i32, Custom); 1240 setOperationAction(ISD::SUB, MVT::v16i16, Custom); 1241 setOperationAction(ISD::SUB, MVT::v32i8, Custom); 1242 1243 setOperationAction(ISD::MUL, MVT::v4i64, Custom); 1244 setOperationAction(ISD::MUL, MVT::v8i32, Custom); 1245 setOperationAction(ISD::MUL, MVT::v16i16, Custom); 1246 // Don't lower v32i8 because there is no 128-bit byte mul 1247 } 1248 1249 // In the customized shift lowering, the legal cases in AVX2 will be 1250 // recognized. 1251 setOperationAction(ISD::SRL, MVT::v4i64, Custom); 1252 setOperationAction(ISD::SRL, MVT::v8i32, Custom); 1253 1254 setOperationAction(ISD::SHL, MVT::v4i64, Custom); 1255 setOperationAction(ISD::SHL, MVT::v8i32, Custom); 1256 1257 setOperationAction(ISD::SRA, MVT::v8i32, Custom); 1258 1259 // Custom lower several nodes for 256-bit types. 1260 for (int i = MVT::FIRST_VECTOR_VALUETYPE; 1261 i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { 1262 MVT VT = (MVT::SimpleValueType)i; 1263 1264 // Extract subvector is special because the value type 1265 // (result) is 128-bit but the source is 256-bit wide. 
1266 if (VT.is128BitVector()) 1267 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1268 1269 // Do not attempt to custom lower other non-256-bit vectors 1270 if (!VT.is256BitVector()) 1271 continue; 1272 1273 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1274 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 1275 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 1276 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 1277 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); 1278 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 1279 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 1280 } 1281 1282 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 1283 for (int i = MVT::v32i8; i != MVT::v4i64; ++i) { 1284 MVT VT = (MVT::SimpleValueType)i; 1285 1286 // Do not attempt to promote non-256-bit vectors 1287 if (!VT.is256BitVector()) 1288 continue; 1289 1290 setOperationAction(ISD::AND, VT, Promote); 1291 AddPromotedToType (ISD::AND, VT, MVT::v4i64); 1292 setOperationAction(ISD::OR, VT, Promote); 1293 AddPromotedToType (ISD::OR, VT, MVT::v4i64); 1294 setOperationAction(ISD::XOR, VT, Promote); 1295 AddPromotedToType (ISD::XOR, VT, MVT::v4i64); 1296 setOperationAction(ISD::LOAD, VT, Promote); 1297 AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); 1298 setOperationAction(ISD::SELECT, VT, Promote); 1299 AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); 1300 } 1301 } 1302 1303 if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) { 1304 addRegisterClass(MVT::v16i32, &X86::VR512RegClass); 1305 addRegisterClass(MVT::v16f32, &X86::VR512RegClass); 1306 addRegisterClass(MVT::v8i64, &X86::VR512RegClass); 1307 addRegisterClass(MVT::v8f64, &X86::VR512RegClass); 1308 1309 addRegisterClass(MVT::v8i1, &X86::VK8RegClass); 1310 addRegisterClass(MVT::v16i1, &X86::VK16RegClass); 1311 1312 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal); 1313 setOperationAction(ISD::LOAD, MVT::v16f32, Legal); 1314 setOperationAction(ISD::LOAD, MVT::v8f64, 
Legal); 1315 setOperationAction(ISD::LOAD, MVT::v8i64, Legal); 1316 setOperationAction(ISD::LOAD, MVT::v16i32, Legal); 1317 setOperationAction(ISD::LOAD, MVT::v16i1, Legal); 1318 1319 setOperationAction(ISD::FADD, MVT::v16f32, Legal); 1320 setOperationAction(ISD::FSUB, MVT::v16f32, Legal); 1321 setOperationAction(ISD::FMUL, MVT::v16f32, Legal); 1322 setOperationAction(ISD::FDIV, MVT::v16f32, Legal); 1323 setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); 1324 setOperationAction(ISD::FNEG, MVT::v16f32, Custom); 1325 1326 setOperationAction(ISD::FADD, MVT::v8f64, Legal); 1327 setOperationAction(ISD::FSUB, MVT::v8f64, Legal); 1328 setOperationAction(ISD::FMUL, MVT::v8f64, Legal); 1329 setOperationAction(ISD::FDIV, MVT::v8f64, Legal); 1330 setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); 1331 setOperationAction(ISD::FNEG, MVT::v8f64, Custom); 1332 setOperationAction(ISD::FMA, MVT::v8f64, Legal); 1333 setOperationAction(ISD::FMA, MVT::v16f32, Legal); 1334 setOperationAction(ISD::SDIV, MVT::v16i32, Custom); 1335 1336 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); 1337 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); 1338 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); 1339 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); 1340 if (Subtarget->is64Bit()) { 1341 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal); 1342 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal); 1343 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal); 1344 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal); 1345 } 1346 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); 1347 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); 1348 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); 1349 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); 1350 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); 1351 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); 1352 setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); 1353 
setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); 1354 1355 setOperationAction(ISD::TRUNCATE, MVT::i1, Legal); 1356 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); 1357 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); 1358 setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); 1359 setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); 1360 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); 1361 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); 1362 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); 1363 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); 1364 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); 1365 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); 1366 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); 1367 1368 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); 1369 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); 1370 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); 1371 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); 1372 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); 1373 1374 setOperationAction(ISD::SETCC, MVT::v16i1, Custom); 1375 setOperationAction(ISD::SETCC, MVT::v8i1, Custom); 1376 1377 setOperationAction(ISD::MUL, MVT::v8i64, Custom); 1378 1379 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); 1380 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); 1381 setOperationAction(ISD::SELECT, MVT::v8f64, Custom); 1382 setOperationAction(ISD::SELECT, MVT::v8i64, Custom); 1383 setOperationAction(ISD::SELECT, MVT::v16f32, Custom); 1384 1385 setOperationAction(ISD::ADD, MVT::v8i64, Legal); 1386 setOperationAction(ISD::ADD, MVT::v16i32, Legal); 1387 1388 setOperationAction(ISD::SUB, MVT::v8i64, Legal); 1389 setOperationAction(ISD::SUB, MVT::v16i32, Legal); 1390 1391 setOperationAction(ISD::MUL, MVT::v16i32, Legal); 1392 1393 setOperationAction(ISD::SRL, MVT::v8i64, Custom); 1394 
setOperationAction(ISD::SRL, MVT::v16i32, Custom); 1395 1396 setOperationAction(ISD::SHL, MVT::v8i64, Custom); 1397 setOperationAction(ISD::SHL, MVT::v16i32, Custom); 1398 1399 setOperationAction(ISD::SRA, MVT::v8i64, Custom); 1400 setOperationAction(ISD::SRA, MVT::v16i32, Custom); 1401 1402 setOperationAction(ISD::AND, MVT::v8i64, Legal); 1403 setOperationAction(ISD::OR, MVT::v8i64, Legal); 1404 setOperationAction(ISD::XOR, MVT::v8i64, Legal); 1405 setOperationAction(ISD::AND, MVT::v16i32, Legal); 1406 setOperationAction(ISD::OR, MVT::v16i32, Legal); 1407 setOperationAction(ISD::XOR, MVT::v16i32, Legal); 1408 1409 // Custom lower several nodes. 1410 for (int i = MVT::FIRST_VECTOR_VALUETYPE; 1411 i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { 1412 MVT VT = (MVT::SimpleValueType)i; 1413 1414 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 1415 // Extract subvector is special because the value type 1416 // (result) is 256/128-bit but the source is 512-bit wide. 1417 if (VT.is128BitVector() || VT.is256BitVector()) 1418 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1419 1420 if (VT.getVectorElementType() == MVT::i1) 1421 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); 1422 1423 // Do not attempt to custom lower other non-512-bit vectors 1424 if (!VT.is512BitVector()) 1425 continue; 1426 1427 if ( EltSize >= 32) { 1428 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 1429 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 1430 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1431 setOperationAction(ISD::VSELECT, VT, Legal); 1432 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 1433 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); 1434 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 1435 } 1436 } 1437 for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { 1438 MVT VT = (MVT::SimpleValueType)i; 1439 1440 // Do not attempt to promote non-256-bit vectors 1441 if (!VT.is512BitVector()) 1442 continue; 1443 1444 
setOperationAction(ISD::SELECT, VT, Promote); 1445 AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); 1446 } 1447 }// has AVX-512 1448 1449 // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion 1450 // of this type with custom code. 1451 for (int VT = MVT::FIRST_VECTOR_VALUETYPE; 1452 VT != MVT::LAST_VECTOR_VALUETYPE; VT++) { 1453 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, 1454 Custom); 1455 } 1456 1457 // We want to custom lower some of our intrinsics. 1458 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1459 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); 1460 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 1461 1462 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't 1463 // handle type legalization for these operations here. 1464 // 1465 // FIXME: We really should do custom legalization for addition and 1466 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better 1467 // than generic legalization for 64-bit multiplication-with-overflow, though. 1468 for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { 1469 // Add/Sub/Mul with overflow operations are custom lowered. 1470 MVT VT = IntVTs[i]; 1471 setOperationAction(ISD::SADDO, VT, Custom); 1472 setOperationAction(ISD::UADDO, VT, Custom); 1473 setOperationAction(ISD::SSUBO, VT, Custom); 1474 setOperationAction(ISD::USUBO, VT, Custom); 1475 setOperationAction(ISD::SMULO, VT, Custom); 1476 setOperationAction(ISD::UMULO, VT, Custom); 1477 } 1478 1479 // There are no 8-bit 3-address imul/mul instructions 1480 setOperationAction(ISD::SMULO, MVT::i8, Expand); 1481 setOperationAction(ISD::UMULO, MVT::i8, Expand); 1482 1483 if (!Subtarget->is64Bit()) { 1484 // These libcalls are not available in 32-bit. 
1485 setLibcallName(RTLIB::SHL_I128, 0); 1486 setLibcallName(RTLIB::SRL_I128, 0); 1487 setLibcallName(RTLIB::SRA_I128, 0); 1488 } 1489 1490 // Combine sin / cos into one node or libcall if possible. 1491 if (Subtarget->hasSinCos()) { 1492 setLibcallName(RTLIB::SINCOS_F32, "sincosf"); 1493 setLibcallName(RTLIB::SINCOS_F64, "sincos"); 1494 if (Subtarget->isTargetDarwin()) { 1495 // For MacOSX, we don't want to the normal expansion of a libcall to 1496 // sincos. We want to issue a libcall to __sincos_stret to avoid memory 1497 // traffic. 1498 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 1499 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 1500 } 1501 } 1502 1503 // We have target-specific dag combine patterns for the following nodes: 1504 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 1505 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 1506 setTargetDAGCombine(ISD::VSELECT); 1507 setTargetDAGCombine(ISD::SELECT); 1508 setTargetDAGCombine(ISD::SHL); 1509 setTargetDAGCombine(ISD::SRA); 1510 setTargetDAGCombine(ISD::SRL); 1511 setTargetDAGCombine(ISD::OR); 1512 setTargetDAGCombine(ISD::AND); 1513 setTargetDAGCombine(ISD::ADD); 1514 setTargetDAGCombine(ISD::FADD); 1515 setTargetDAGCombine(ISD::FSUB); 1516 setTargetDAGCombine(ISD::FMA); 1517 setTargetDAGCombine(ISD::SUB); 1518 setTargetDAGCombine(ISD::LOAD); 1519 setTargetDAGCombine(ISD::STORE); 1520 setTargetDAGCombine(ISD::ZERO_EXTEND); 1521 setTargetDAGCombine(ISD::ANY_EXTEND); 1522 setTargetDAGCombine(ISD::SIGN_EXTEND); 1523 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); 1524 setTargetDAGCombine(ISD::TRUNCATE); 1525 setTargetDAGCombine(ISD::SINT_TO_FP); 1526 setTargetDAGCombine(ISD::SETCC); 1527 if (Subtarget->is64Bit()) 1528 setTargetDAGCombine(ISD::MUL); 1529 setTargetDAGCombine(ISD::XOR); 1530 1531 computeRegisterProperties(); 1532 1533 // On Darwin, -Os means optimize for size without hurting performance, 1534 // do not reduce the limit. 
  // Tail of the X86TargetLowering constructor: tune the thresholds that
  // decide when @llvm.memset/memcpy/memmove are expanded inline into a
  // sequence of stores rather than lowered to a libcall.
  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  setPrefLoopAlignment(4); // 2^4 bytes.

  // Predictable cmov don't hurt on atom because it's in-order.
  PredictableSelectIsExpensive = !Subtarget->isAtom();

  setPrefFunctionAlignment(4); // 2^4 bytes.
}

/// Return the value type to use for the result of a comparison.
/// Scalar setcc results are i8; with AVX-512 (and hard-float), 8- and
/// 16-element vector compares produce the v8i1/v16i1 mask register types.
/// All other vector compares use an integer vector of the same shape.
EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  const TargetMachine &TM = getTargetMachine();
  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512())
    // Only 8- and 16-element counts map onto mask registers here; any other
    // element count falls through to the generic integer-vector result.
    switch(VT.getVectorNumElements()) {
    case 8: return MVT::v8i1;
    case 16: return MVT::v16i1;
  }

  return VT.changeVectorElementTypeToInteger();
}

/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
/// Recursively walk \p Ty looking for a 128-bit vector; if one is found
/// (directly, or nested inside an array or struct), raise \p MaxAlign to 16.
/// MaxAlign only ever increases, and 16 is the cap, so recursion stops early
/// once it is reached.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      // Early-out: 16 is the maximum this analysis ever produces.
      if (MaxAlign == 16)
        break;
    }
  }
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  // 32-bit: default to 4, bumped to 16 only if SSE is available and the type
  // contains a 128-bit vector somewhere inside it.
  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero that means it's safe that the destination
/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
/// means there isn't a need to check it against alignment requirement,
/// probably because the source does not need to be loaded. If 'IsMemset' is
/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool IsMemset, bool ZeroMemset,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  const Function *F = MF.getFunction();
  // Vector types are only considered for non-memset (or zero-memset)
  // expansions, and never inside NoImplicitFloat functions.
  if ((!IsMemset || ZeroMemset) &&
      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                       Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16)))) {
      // Prefer the widest legal vector: 256-bit when AVX/AVX2 is present,
      // otherwise a 128-bit SSE type.
      if (Size >= 32) {
        if (Subtarget->hasInt256())
          return MVT::v8i32;
        if (Subtarget->hasFp256())
          return MVT::v8f32;
      }
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  // Scalar fallback: i64 on 64-bit targets for 8+ byte operations, else i32.
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// Returns true if it's safe to use the given memop type. f32/f64 are only
/// safe when the corresponding scalar-SSE level is enabled (otherwise they
/// would go through the x87 stack); everything else is always safe.
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  if (VT == MVT::f32)
    return X86ScalarSSEf32;
  else if (VT == MVT::f64)
    return X86ScalarSSEf64;
  return true;
}

/// x86 permits unaligned memory accesses for all types; *Fast reports
/// whether they are also cheap on this subtarget.
bool
X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
  if (Fast)
    *Fast = Subtarget->isUnalignedMemAccessFast();
  return true;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

/// Emit one EK_Custom32 jump-table entry for GOT-style PIC: a @GOTOFF
/// reference to the destination block's symbol.
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid,MCContext &Ctx) const{
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have SDLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
}

// FIXME: Why this routine is here? Move to RegInfo!
/// Map a value type to the register class (and a relative spill cost) that
/// best represents it for register-pressure purposes: GPRs for integers,
/// VR64 for MMX, VR128 for scalar FP and all SSE/AVX vectors.
std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(MVT VT) const{
  const TargetRegisterClass *RRC = 0;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget->is64Bit() ?
      (const TargetRegisterClass*)&X86::GR64RegClass :
      (const TargetRegisterClass*)&X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
  case MVT::v4f64:
    RRC = &X86::VR128RegClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

/// Report where the stack-protector cookie lives. Only implemented for
/// Linux: a fixed offset in a segment-register address space (256 = %gs,
/// 257 = %fs in X86's addrspace convention).
bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
                                               unsigned &Offset) const {
  if (!Subtarget->isTargetLinux())
    return false;

  if (Subtarget->is64Bit()) {
    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
    Offset = 0x28;
    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
      AddressSpace = 256;
    else
      AddressSpace = 257;
  } else {
    // %gs:0x14 on i386
    Offset = 0x14;
    AddressSpace = 256;
  }
  return true;
}

/// An addrspacecast is free as long as neither side is one of the special
/// segment-register address spaces (>= 256).
bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
  assert(SrcAS != DestAS && "Expected different address spaces!");

  return SrcAS < 256 && DestAS < 256;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
1777//===----------------------------------------------------------------------===// 1778 1779#include "X86GenCallingConv.inc" 1780 1781bool 1782X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, 1783 MachineFunction &MF, bool isVarArg, 1784 const SmallVectorImpl<ISD::OutputArg> &Outs, 1785 LLVMContext &Context) const { 1786 SmallVector<CCValAssign, 16> RVLocs; 1787 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1788 RVLocs, Context); 1789 return CCInfo.CheckReturn(Outs, RetCC_X86); 1790} 1791 1792const uint16_t *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { 1793 static const uint16_t ScratchRegs[] = { X86::R11, 0 }; 1794 return ScratchRegs; 1795} 1796 1797SDValue 1798X86TargetLowering::LowerReturn(SDValue Chain, 1799 CallingConv::ID CallConv, bool isVarArg, 1800 const SmallVectorImpl<ISD::OutputArg> &Outs, 1801 const SmallVectorImpl<SDValue> &OutVals, 1802 SDLoc dl, SelectionDAG &DAG) const { 1803 MachineFunction &MF = DAG.getMachineFunction(); 1804 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1805 1806 SmallVector<CCValAssign, 16> RVLocs; 1807 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1808 RVLocs, *DAG.getContext()); 1809 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1810 1811 SDValue Flag; 1812 SmallVector<SDValue, 6> RetOps; 1813 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1814 // Operand #1 = Bytes To Pop 1815 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1816 MVT::i16)); 1817 1818 // Copy the result values into the output registers. 
1819 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1820 CCValAssign &VA = RVLocs[i]; 1821 assert(VA.isRegLoc() && "Can only return in registers!"); 1822 SDValue ValToCopy = OutVals[i]; 1823 EVT ValVT = ValToCopy.getValueType(); 1824 1825 // Promote values to the appropriate types 1826 if (VA.getLocInfo() == CCValAssign::SExt) 1827 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); 1828 else if (VA.getLocInfo() == CCValAssign::ZExt) 1829 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); 1830 else if (VA.getLocInfo() == CCValAssign::AExt) 1831 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); 1832 else if (VA.getLocInfo() == CCValAssign::BCvt) 1833 ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); 1834 1835 // If this is x86-64, and we disabled SSE, we can't return FP values, 1836 // or SSE or MMX vectors. 1837 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 1838 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 1839 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { 1840 report_fatal_error("SSE register return with SSE disabled"); 1841 } 1842 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 1843 // llvm-gcc has never done it right and no one has noticed, so this 1844 // should be OK for now. 1845 if (ValVT == MVT::f64 && 1846 (Subtarget->is64Bit() && !Subtarget->hasSSE2())) 1847 report_fatal_error("SSE2 register return with SSE2 disabled"); 1848 1849 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1850 // the RET instruction and handled by the FP Stackifier. 1851 if (VA.getLocReg() == X86::ST0 || 1852 VA.getLocReg() == X86::ST1) { 1853 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1854 // change the value to the FP stack register class. 
1855 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1856 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1857 RetOps.push_back(ValToCopy); 1858 // Don't emit a copytoreg. 1859 continue; 1860 } 1861 1862 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1863 // which is returned in RAX / RDX. 1864 if (Subtarget->is64Bit()) { 1865 if (ValVT == MVT::x86mmx) { 1866 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1867 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 1868 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1869 ValToCopy); 1870 // If we don't have SSE2 available, convert to v4f32 so the generated 1871 // register is legal. 1872 if (!Subtarget->hasSSE2()) 1873 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 1874 } 1875 } 1876 } 1877 1878 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1879 Flag = Chain.getValue(1); 1880 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 1881 } 1882 1883 // The x86-64 ABIs require that for returning structs by value we copy 1884 // the sret argument into %rax/%eax (depending on ABI) for the return. 1885 // Win32 requires us to put the sret argument to %eax as well. 1886 // We saved the argument into a virtual register in the entry block, 1887 // so now we copy the value out and into %rax/%eax. 1888 if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() && 1889 (Subtarget->is64Bit() || Subtarget->isTargetWindows())) { 1890 MachineFunction &MF = DAG.getMachineFunction(); 1891 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1892 unsigned Reg = FuncInfo->getSRetReturnReg(); 1893 assert(Reg && 1894 "SRetReturnReg should have been set in LowerFormalArguments()."); 1895 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1896 1897 unsigned RetValReg 1898 = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? 
1899 X86::RAX : X86::EAX; 1900 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); 1901 Flag = Chain.getValue(1); 1902 1903 // RAX/EAX now acts like a return value. 1904 RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy())); 1905 } 1906 1907 RetOps[0] = Chain; // Update chain. 1908 1909 // Add the flag if we have it. 1910 if (Flag.getNode()) 1911 RetOps.push_back(Flag); 1912 1913 return DAG.getNode(X86ISD::RET_FLAG, dl, 1914 MVT::Other, &RetOps[0], RetOps.size()); 1915} 1916 1917bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 1918 if (N->getNumValues() != 1) 1919 return false; 1920 if (!N->hasNUsesOfValue(1, 0)) 1921 return false; 1922 1923 SDValue TCChain = Chain; 1924 SDNode *Copy = *N->use_begin(); 1925 if (Copy->getOpcode() == ISD::CopyToReg) { 1926 // If the copy has a glue operand, we conservatively assume it isn't safe to 1927 // perform a tail call. 1928 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 1929 return false; 1930 TCChain = Copy->getOperand(0); 1931 } else if (Copy->getOpcode() != ISD::FP_EXTEND) 1932 return false; 1933 1934 bool HasRet = false; 1935 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 1936 UI != UE; ++UI) { 1937 if (UI->getOpcode() != X86ISD::RET_FLAG) 1938 return false; 1939 HasRet = true; 1940 } 1941 1942 if (!HasRet) 1943 return false; 1944 1945 Chain = TCChain; 1946 return true; 1947} 1948 1949MVT 1950X86TargetLowering::getTypeForExtArgOrReturn(MVT VT, 1951 ISD::NodeType ExtendKind) const { 1952 MVT ReturnMVT; 1953 // TODO: Is this also valid on 32-bit? 1954 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) 1955 ReturnMVT = MVT::i8; 1956 else 1957 ReturnMVT = MVT::i32; 1958 1959 MVT MinVT = getRegisterType(ReturnMVT); 1960 return VT.bitsLT(MinVT) ? MinVT : VT; 1961} 1962 1963/// LowerCallResult - Lower the result values of a call into the 1964/// appropriate copies out of appropriate physical registers. 
1965/// 1966SDValue 1967X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1968 CallingConv::ID CallConv, bool isVarArg, 1969 const SmallVectorImpl<ISD::InputArg> &Ins, 1970 SDLoc dl, SelectionDAG &DAG, 1971 SmallVectorImpl<SDValue> &InVals) const { 1972 1973 // Assign locations to each value returned by this call. 1974 SmallVector<CCValAssign, 16> RVLocs; 1975 bool Is64Bit = Subtarget->is64Bit(); 1976 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 1977 getTargetMachine(), RVLocs, *DAG.getContext()); 1978 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1979 1980 // Copy all of the result registers out of their specified physreg. 1981 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 1982 CCValAssign &VA = RVLocs[i]; 1983 EVT CopyVT = VA.getValVT(); 1984 1985 // If this is x86-64, and we disabled SSE, we can't return FP values 1986 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1987 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { 1988 report_fatal_error("SSE register return with SSE disabled"); 1989 } 1990 1991 SDValue Val; 1992 1993 // If this is a call to a function that returns an fp value on the floating 1994 // point stack, we must guarantee the value is popped from the stack, so 1995 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1996 // if the return value is not used. We use the FpPOP_RETVAL instruction 1997 // instead. 1998 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1999 // If we prefer to use the value in xmm registers, copy it out as f80 and 2000 // use a truncate to move it from fp stack reg to xmm reg. 2001 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 2002 SDValue Ops[] = { Chain, InFlag }; 2003 Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT, 2004 MVT::Other, MVT::Glue, Ops), 1); 2005 Val = Chain.getValue(0); 2006 2007 // Round the f80 to the right size, which also moves it to the appropriate 2008 // xmm register. 
2009 if (CopyVT != VA.getValVT()) 2010 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 2011 // This truncation won't change the value. 2012 DAG.getIntPtrConstant(1)); 2013 } else { 2014 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 2015 CopyVT, InFlag).getValue(1); 2016 Val = Chain.getValue(0); 2017 } 2018 InFlag = Chain.getValue(2); 2019 InVals.push_back(Val); 2020 } 2021 2022 return Chain; 2023} 2024 2025//===----------------------------------------------------------------------===// 2026// C & StdCall & Fast Calling Convention implementation 2027//===----------------------------------------------------------------------===// 2028// StdCall calling convention seems to be standard for many Windows' API 2029// routines and around. It differs from C calling convention just a little: 2030// callee should clean up the stack, not caller. Symbols should be also 2031// decorated in some fancy way :) It doesn't support any vector arguments. 2032// For info on fast calling convention see Fast Calling Convention (tail call) 2033// implementation LowerX86_32FastCCCallTo. 2034 2035/// CallIsStructReturn - Determines whether a call uses struct return 2036/// semantics. 2037enum StructReturnType { 2038 NotStructReturn, 2039 RegStructReturn, 2040 StackStructReturn 2041}; 2042static StructReturnType 2043callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 2044 if (Outs.empty()) 2045 return NotStructReturn; 2046 2047 const ISD::ArgFlagsTy &Flags = Outs[0].Flags; 2048 if (!Flags.isSRet()) 2049 return NotStructReturn; 2050 if (Flags.isInReg()) 2051 return RegStructReturn; 2052 return StackStructReturn; 2053} 2054 2055/// ArgsAreStructReturn - Determines whether a function uses struct 2056/// return semantics. 
2057static StructReturnType 2058argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 2059 if (Ins.empty()) 2060 return NotStructReturn; 2061 2062 const ISD::ArgFlagsTy &Flags = Ins[0].Flags; 2063 if (!Flags.isSRet()) 2064 return NotStructReturn; 2065 if (Flags.isInReg()) 2066 return RegStructReturn; 2067 return StackStructReturn; 2068} 2069 2070/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 2071/// by "Src" to address "Dst" with size and alignment information specified by 2072/// the specific parameter attribute. The copy will be passed as a byval 2073/// function parameter. 2074static SDValue 2075CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 2076 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 2077 SDLoc dl) { 2078 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 2079 2080 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 2081 /*isVolatile*/false, /*AlwaysInline=*/true, 2082 MachinePointerInfo(), MachinePointerInfo()); 2083} 2084 2085/// IsTailCallConvention - Return true if the calling convention is one that 2086/// supports tail call optimization. 2087static bool IsTailCallConvention(CallingConv::ID CC) { 2088 return (CC == CallingConv::Fast || CC == CallingConv::GHC || 2089 CC == CallingConv::HiPE); 2090} 2091 2092/// \brief Return true if the calling convention is a C calling convention. 
static bool IsCCallConvention(CallingConv::ID CC) {
  return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
          CC == CallingConv::X86_64_SysV);
}

/// A call site may be emitted as a tail call only if the source marked it
/// 'tail', tail calls are not globally disabled, and the callee uses a
/// tail-call-capable or C calling convention.
bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
  if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
    return false;

  CallSite CS(CI);
  CallingConv::ID CalleeCC = CS.getCallingConv();
  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
    return false;

  return true;
}

/// FuncIsMadeTailCallSafe - Return true if the function is being made into
/// a tailcall target by changing its ABI.
static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
                                   bool GuaranteedTailCallOpt) {
  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
}

/// Lower one stack-passed formal argument: either return the frame index of
/// a byval aggregate directly, or create a fixed stack object and load the
/// value from it.
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain,
                                    CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    SDLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo *MFI,
                                    unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
                              getTargetMachine().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;

  // If value is passed by pointer we have address passed instead of the value
  // itself.
  if (VA.getLocInfo() == CCValAssign::Indirect)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This can be
  // changed with more analysis.
  // In case of tail call optimization mark all arguments mutable. Since they
  // could be overwritten by lowering of arguments in case of a tail call.
  if (Flags.isByVal()) {
    unsigned Bytes = Flags.getByValSize();
    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
    int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
    return DAG.getFrameIndex(FI, getPointerTy());
  } else {
    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
                                    VA.getLocMemOffset(), isImmutable);
    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
    return DAG.getLoad(ValVT, dl, Chain, FIN,
                       MachinePointerInfo::getFixedStack(FI),
                       false, false, false, 0);
  }
}

SDValue
X86TargetLowering::LowerFormalArguments(SDValue Chain,
                                        CallingConv::ID CallConv,
                                        bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                        SDLoc dl,
                                        SelectionDAG &DAG,
                                        SmallVectorImpl<SDValue> &InVals)
                                          const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  const Function* Fn = MF.getFunction();
  // Cygwin/MinGW's 'main' needs a frame pointer (presumably for the runtime's
  // stack-setup interop -- predates this view; do not change casually).
  if (Fn->hasExternalLinkage() &&
      Subtarget->isTargetCygMing() &&
      Fn->getName() == "main")
    FuncInfo->setForceFramePointer(true);

  MachineFrameInfo *MFI = MF.getFrameInfo();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsWindows = Subtarget->isTargetWindows();
  bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);

  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
         "Var args not supported with calling convention fastcc, ghc or hipe");

  // Assign locations to all of the incoming arguments.
2185 SmallVector<CCValAssign, 16> ArgLocs; 2186 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 2187 ArgLocs, *DAG.getContext()); 2188 2189 // Allocate shadow area for Win64 2190 if (IsWin64) 2191 CCInfo.AllocateStack(32, 8); 2192 2193 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 2194 2195 unsigned LastVal = ~0U; 2196 SDValue ArgValue; 2197 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2198 CCValAssign &VA = ArgLocs[i]; 2199 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 2200 // places. 2201 assert(VA.getValNo() != LastVal && 2202 "Don't support value assigned to multiple locs yet"); 2203 (void)LastVal; 2204 LastVal = VA.getValNo(); 2205 2206 if (VA.isRegLoc()) { 2207 EVT RegVT = VA.getLocVT(); 2208 const TargetRegisterClass *RC; 2209 if (RegVT == MVT::i32) 2210 RC = &X86::GR32RegClass; 2211 else if (Is64Bit && RegVT == MVT::i64) 2212 RC = &X86::GR64RegClass; 2213 else if (RegVT == MVT::f32) 2214 RC = &X86::FR32RegClass; 2215 else if (RegVT == MVT::f64) 2216 RC = &X86::FR64RegClass; 2217 else if (RegVT.is512BitVector()) 2218 RC = &X86::VR512RegClass; 2219 else if (RegVT.is256BitVector()) 2220 RC = &X86::VR256RegClass; 2221 else if (RegVT.is128BitVector()) 2222 RC = &X86::VR128RegClass; 2223 else if (RegVT == MVT::x86mmx) 2224 RC = &X86::VR64RegClass; 2225 else if (RegVT == MVT::v8i1) 2226 RC = &X86::VK8RegClass; 2227 else if (RegVT == MVT::v16i1) 2228 RC = &X86::VK16RegClass; 2229 else 2230 llvm_unreachable("Unknown argument type!"); 2231 2232 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 2233 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 2234 2235 // If this is an 8 or 16-bit value, it is really passed promoted to 32 2236 // bits. Insert an assert[sz]ext to capture this, then truncate to the 2237 // right size. 
2238 if (VA.getLocInfo() == CCValAssign::SExt) 2239 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 2240 DAG.getValueType(VA.getValVT())); 2241 else if (VA.getLocInfo() == CCValAssign::ZExt) 2242 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 2243 DAG.getValueType(VA.getValVT())); 2244 else if (VA.getLocInfo() == CCValAssign::BCvt) 2245 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 2246 2247 if (VA.isExtInLoc()) { 2248 // Handle MMX values passed in XMM regs. 2249 if (RegVT.isVector()) 2250 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); 2251 else 2252 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 2253 } 2254 } else { 2255 assert(VA.isMemLoc()); 2256 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 2257 } 2258 2259 // If value is passed via pointer - do a load. 2260 if (VA.getLocInfo() == CCValAssign::Indirect) 2261 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 2262 MachinePointerInfo(), false, false, false, 0); 2263 2264 InVals.push_back(ArgValue); 2265 } 2266 2267 // The x86-64 ABIs require that for returning structs by value we copy 2268 // the sret argument into %rax/%eax (depending on ABI) for the return. 2269 // Win32 requires us to put the sret argument to %eax as well. 2270 // Save the argument into a virtual register so that we can access it 2271 // from the return points. 
2272 if (MF.getFunction()->hasStructRetAttr() && 2273 (Subtarget->is64Bit() || Subtarget->isTargetWindows())) { 2274 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2275 unsigned Reg = FuncInfo->getSRetReturnReg(); 2276 if (!Reg) { 2277 MVT PtrTy = getPointerTy(); 2278 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); 2279 FuncInfo->setSRetReturnReg(Reg); 2280 } 2281 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 2282 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 2283 } 2284 2285 unsigned StackSize = CCInfo.getNextStackOffset(); 2286 // Align stack specially for tail calls. 2287 if (FuncIsMadeTailCallSafe(CallConv, 2288 MF.getTarget().Options.GuaranteedTailCallOpt)) 2289 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 2290 2291 // If the function takes variable number of arguments, make a frame index for 2292 // the start of the first vararg value... for expansion of llvm.va_start. 2293 if (isVarArg) { 2294 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 2295 CallConv != CallingConv::X86_ThisCall)) { 2296 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 2297 } 2298 if (Is64Bit) { 2299 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 2300 2301 // FIXME: We should really autogenerate these arrays 2302 static const uint16_t GPR64ArgRegsWin64[] = { 2303 X86::RCX, X86::RDX, X86::R8, X86::R9 2304 }; 2305 static const uint16_t GPR64ArgRegs64Bit[] = { 2306 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 2307 }; 2308 static const uint16_t XMMArgRegs64Bit[] = { 2309 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2310 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2311 }; 2312 const uint16_t *GPR64ArgRegs; 2313 unsigned NumXMMRegs = 0; 2314 2315 if (IsWin64) { 2316 // The XMM registers which might contain var arg parameters are shadowed 2317 // in their paired GPR. So we only need to save the GPR to their home 2318 // slots. 
2319 TotalNumIntRegs = 4; 2320 GPR64ArgRegs = GPR64ArgRegsWin64; 2321 } else { 2322 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 2323 GPR64ArgRegs = GPR64ArgRegs64Bit; 2324 2325 NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, 2326 TotalNumXMMRegs); 2327 } 2328 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 2329 TotalNumIntRegs); 2330 2331 bool NoImplicitFloatOps = Fn->getAttributes(). 2332 hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); 2333 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 2334 "SSE register cannot be used when SSE is disabled!"); 2335 assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat && 2336 NoImplicitFloatOps) && 2337 "SSE register cannot be used when SSE is disabled!"); 2338 if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || 2339 !Subtarget->hasSSE1()) 2340 // Kernel mode asks for SSE to be disabled, so don't push them 2341 // on the stack. 2342 TotalNumXMMRegs = 0; 2343 2344 if (IsWin64) { 2345 const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); 2346 // Get to the caller-allocated home save location. Add 8 to account 2347 // for the return address. 2348 int HomeOffset = TFI.getOffsetOfLocalArea() + 8; 2349 FuncInfo->setRegSaveFrameIndex( 2350 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 2351 // Fixup to set vararg frame on shadow area (4 x i64). 2352 if (NumIntRegs < 4) 2353 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 2354 } else { 2355 // For X86-64, if there are vararg parameters that are passed via 2356 // registers, then we must store them to their spots on the stack so 2357 // they may be loaded by deferencing the result of va_next. 
2358 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 2359 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 2360 FuncInfo->setRegSaveFrameIndex( 2361 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 2362 false)); 2363 } 2364 2365 // Store the integer parameter registers. 2366 SmallVector<SDValue, 8> MemOps; 2367 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 2368 getPointerTy()); 2369 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 2370 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 2371 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 2372 DAG.getIntPtrConstant(Offset)); 2373 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 2374 &X86::GR64RegClass); 2375 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 2376 SDValue Store = 2377 DAG.getStore(Val.getValue(1), dl, Val, FIN, 2378 MachinePointerInfo::getFixedStack( 2379 FuncInfo->getRegSaveFrameIndex(), Offset), 2380 false, false, 0); 2381 MemOps.push_back(Store); 2382 Offset += 8; 2383 } 2384 2385 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 2386 // Now store the XMM (fp + vector) parameter registers. 
2387 SmallVector<SDValue, 11> SaveXMMOps; 2388 SaveXMMOps.push_back(Chain); 2389 2390 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); 2391 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 2392 SaveXMMOps.push_back(ALVal); 2393 2394 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2395 FuncInfo->getRegSaveFrameIndex())); 2396 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2397 FuncInfo->getVarArgsFPOffset())); 2398 2399 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 2400 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 2401 &X86::VR128RegClass); 2402 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 2403 SaveXMMOps.push_back(Val); 2404 } 2405 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 2406 MVT::Other, 2407 &SaveXMMOps[0], SaveXMMOps.size())); 2408 } 2409 2410 if (!MemOps.empty()) 2411 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2412 &MemOps[0], MemOps.size()); 2413 } 2414 } 2415 2416 // Some CCs need callee pop. 2417 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 2418 MF.getTarget().Options.GuaranteedTailCallOpt)) { 2419 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 2420 } else { 2421 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 2422 // If this is an sret function, the return should pop the hidden pointer. 2423 if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && 2424 argsAreStructReturn(Ins) == StackStructReturn) 2425 FuncInfo->setBytesToPopOnReturn(4); 2426 } 2427 2428 if (!Is64Bit) { 2429 // RegSaveFrameIndex is X86-64 only. 2430 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 2431 if (CallConv == CallingConv::X86_FastCall || 2432 CallConv == CallingConv::X86_ThisCall) 2433 // fastcc functions can't have varargs. 
      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
  }

  FuncInfo->setArgumentStackSize(StackSize);

  return Chain;
}

/// LowerMemOpCallTo - Lower an outgoing call argument that was assigned a
/// stack location by the calling convention: compute the argument's address
/// as StackPtr + LocMemOffset, then either copy a byval aggregate into place
/// or emit a plain store of Arg to that slot.
SDValue
X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
                                    SDValue StackPtr, SDValue Arg,
                                    SDLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    ISD::ArgFlagsTy Flags) const {
  unsigned LocMemOffset = VA.getLocMemOffset();
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
  if (Flags.isByVal())
    // Byval aggregates are materialized with a (possibly expanded) memcpy.
    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);

  return DAG.getStore(Chain, dl, Arg, PtrOff,
                      MachinePointerInfo::getStack(LocMemOffset),
                      false, false, 0);
}

/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
/// optimization is performed and it is required.
SDValue
X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
                                           SDValue &OutRetAddr, SDValue Chain,
                                           bool IsTailCall, bool Is64Bit,
                                           int FPDiff, SDLoc dl) const {
  // Adjust the Return address stack slot.
  EVT VT = getPointerTy();
  OutRetAddr = getReturnAddressFrameIndex(DAG);

  // Load the "old" Return address. OutRetAddr is set (by reference) to the
  // loaded value; the function returns the load's output chain (value #1).
  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
                           false, false, false, 0);
  return SDValue(OutRetAddr.getNode(), 1);
}

/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue
EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
                         SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
                         unsigned SlotSize, int FPDiff, SDLoc dl) {
  // Store the return address to the appropriate stack slot.
  if (!FPDiff) return Chain;
  // Calculate the new stack slot for the return address.
2485 int NewReturnAddrFI = 2486 MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, 2487 false); 2488 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); 2489 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 2490 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 2491 false, false, 0); 2492 return Chain; 2493} 2494 2495SDValue 2496X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 2497 SmallVectorImpl<SDValue> &InVals) const { 2498 SelectionDAG &DAG = CLI.DAG; 2499 SDLoc &dl = CLI.DL; 2500 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 2501 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 2502 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 2503 SDValue Chain = CLI.Chain; 2504 SDValue Callee = CLI.Callee; 2505 CallingConv::ID CallConv = CLI.CallConv; 2506 bool &isTailCall = CLI.IsTailCall; 2507 bool isVarArg = CLI.IsVarArg; 2508 2509 MachineFunction &MF = DAG.getMachineFunction(); 2510 bool Is64Bit = Subtarget->is64Bit(); 2511 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); 2512 bool IsWindows = Subtarget->isTargetWindows(); 2513 StructReturnType SR = callIsStructReturn(Outs); 2514 bool IsSibcall = false; 2515 2516 if (MF.getTarget().Options.DisableTailCalls) 2517 isTailCall = false; 2518 2519 if (isTailCall) { 2520 // Check if it's really possible to do a tail call. 2521 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 2522 isVarArg, SR != NotStructReturn, 2523 MF.getFunction()->hasStructRetAttr(), CLI.RetTy, 2524 Outs, OutVals, Ins, DAG); 2525 2526 // Sibcalls are automatically detected tailcalls which do not require 2527 // ABI changes. 
2528 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) 2529 IsSibcall = true; 2530 2531 if (isTailCall) 2532 ++NumTailCalls; 2533 } 2534 2535 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 2536 "Var args not supported with calling convention fastcc, ghc or hipe"); 2537 2538 // Analyze operands of the call, assigning locations to each operand. 2539 SmallVector<CCValAssign, 16> ArgLocs; 2540 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 2541 ArgLocs, *DAG.getContext()); 2542 2543 // Allocate shadow area for Win64 2544 if (IsWin64) 2545 CCInfo.AllocateStack(32, 8); 2546 2547 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2548 2549 // Get a count of how many bytes are to be pushed on the stack. 2550 unsigned NumBytes = CCInfo.getNextStackOffset(); 2551 if (IsSibcall) 2552 // This is a sibcall. The memory operands are available in caller's 2553 // own caller's stack. 2554 NumBytes = 0; 2555 else if (getTargetMachine().Options.GuaranteedTailCallOpt && 2556 IsTailCallConvention(CallConv)) 2557 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 2558 2559 int FPDiff = 0; 2560 if (isTailCall && !IsSibcall) { 2561 // Lower arguments at fp - stackoffset + fpdiff. 2562 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); 2563 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); 2564 2565 FPDiff = NumBytesCallerPushed - NumBytes; 2566 2567 // Set the delta of movement of the returnaddr stackslot. 2568 // But only set if delta is greater than previous delta. 2569 if (FPDiff < X86Info->getTCReturnAddrDelta()) 2570 X86Info->setTCReturnAddrDelta(FPDiff); 2571 } 2572 2573 if (!IsSibcall) 2574 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), 2575 dl); 2576 2577 SDValue RetAddrFrIdx; 2578 // Load return address for tail calls. 
2579 if (isTailCall && FPDiff) 2580 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 2581 Is64Bit, FPDiff, dl); 2582 2583 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 2584 SmallVector<SDValue, 8> MemOpChains; 2585 SDValue StackPtr; 2586 2587 // Walk the register/memloc assignments, inserting copies/loads. In the case 2588 // of tail call optimization arguments are handle later. 2589 const X86RegisterInfo *RegInfo = 2590 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 2591 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2592 CCValAssign &VA = ArgLocs[i]; 2593 EVT RegVT = VA.getLocVT(); 2594 SDValue Arg = OutVals[i]; 2595 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2596 bool isByVal = Flags.isByVal(); 2597 2598 // Promote the value if needed. 2599 switch (VA.getLocInfo()) { 2600 default: llvm_unreachable("Unknown loc info!"); 2601 case CCValAssign::Full: break; 2602 case CCValAssign::SExt: 2603 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 2604 break; 2605 case CCValAssign::ZExt: 2606 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 2607 break; 2608 case CCValAssign::AExt: 2609 if (RegVT.is128BitVector()) { 2610 // Special case: passing MMX values in XMM registers. 2611 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); 2612 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 2613 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 2614 } else 2615 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 2616 break; 2617 case CCValAssign::BCvt: 2618 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg); 2619 break; 2620 case CCValAssign::Indirect: { 2621 // Store the argument. 
2622 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 2623 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 2624 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 2625 MachinePointerInfo::getFixedStack(FI), 2626 false, false, 0); 2627 Arg = SpillSlot; 2628 break; 2629 } 2630 } 2631 2632 if (VA.isRegLoc()) { 2633 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2634 if (isVarArg && IsWin64) { 2635 // Win64 ABI requires argument XMM reg to be copied to the corresponding 2636 // shadow reg if callee is a varargs function. 2637 unsigned ShadowReg = 0; 2638 switch (VA.getLocReg()) { 2639 case X86::XMM0: ShadowReg = X86::RCX; break; 2640 case X86::XMM1: ShadowReg = X86::RDX; break; 2641 case X86::XMM2: ShadowReg = X86::R8; break; 2642 case X86::XMM3: ShadowReg = X86::R9; break; 2643 } 2644 if (ShadowReg) 2645 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 2646 } 2647 } else if (!IsSibcall && (!isTailCall || isByVal)) { 2648 assert(VA.isMemLoc()); 2649 if (StackPtr.getNode() == 0) 2650 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), 2651 getPointerTy()); 2652 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2653 dl, DAG, VA, Flags)); 2654 } 2655 } 2656 2657 if (!MemOpChains.empty()) 2658 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2659 &MemOpChains[0], MemOpChains.size()); 2660 2661 if (Subtarget->isPICStyleGOT()) { 2662 // ELF / PIC requires GOT in the EBX register before function calls via PLT 2663 // GOT pointer. 2664 if (!isTailCall) { 2665 RegsToPass.push_back(std::make_pair(unsigned(X86::EBX), 2666 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()))); 2667 } else { 2668 // If we are tail calling and generating PIC/GOT style code load the 2669 // address of the callee into ECX. The value in ecx is used as target of 2670 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2671 // for tail calls on PIC/GOT architectures. 
Normally we would just put the 2672 // address of GOT into ebx and then call target@PLT. But for tail calls 2673 // ebx would be restored (since ebx is callee saved) before jumping to the 2674 // target@PLT. 2675 2676 // Note: The actual moving to ECX is done further down. 2677 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2678 if (G && !G->getGlobal()->hasHiddenVisibility() && 2679 !G->getGlobal()->hasProtectedVisibility()) 2680 Callee = LowerGlobalAddress(Callee, DAG); 2681 else if (isa<ExternalSymbolSDNode>(Callee)) 2682 Callee = LowerExternalSymbol(Callee, DAG); 2683 } 2684 } 2685 2686 if (Is64Bit && isVarArg && !IsWin64) { 2687 // From AMD64 ABI document: 2688 // For calls that may call functions that use varargs or stdargs 2689 // (prototype-less calls or calls to functions containing ellipsis (...) in 2690 // the declaration) %al is used as hidden argument to specify the number 2691 // of SSE registers used. The contents of %al do not need to match exactly 2692 // the number of registers, but must be an ubound on the number of SSE 2693 // registers used and is in the range 0 - 8 inclusive. 2694 2695 // Count the number of XMM registers allocated. 2696 static const uint16_t XMMArgRegs[] = { 2697 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2698 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2699 }; 2700 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 2701 assert((Subtarget->hasSSE1() || !NumXMMRegs) 2702 && "SSE registers cannot be used when SSE is disabled"); 2703 2704 RegsToPass.push_back(std::make_pair(unsigned(X86::AL), 2705 DAG.getConstant(NumXMMRegs, MVT::i8))); 2706 } 2707 2708 // For tail calls lower the arguments to the 'real' stack slot. 
2709 if (isTailCall) { 2710 // Force all the incoming stack arguments to be loaded from the stack 2711 // before any new outgoing arguments are stored to the stack, because the 2712 // outgoing stack slots may alias the incoming argument stack slots, and 2713 // the alias isn't otherwise explicit. This is slightly more conservative 2714 // than necessary, because it means that each store effectively depends 2715 // on every argument instead of just those arguments it would clobber. 2716 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2717 2718 SmallVector<SDValue, 8> MemOpChains2; 2719 SDValue FIN; 2720 int FI = 0; 2721 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 2722 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2723 CCValAssign &VA = ArgLocs[i]; 2724 if (VA.isRegLoc()) 2725 continue; 2726 assert(VA.isMemLoc()); 2727 SDValue Arg = OutVals[i]; 2728 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2729 // Create frame index. 2730 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2731 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2732 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 2733 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2734 2735 if (Flags.isByVal()) { 2736 // Copy relative to framepointer. 2737 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2738 if (StackPtr.getNode() == 0) 2739 StackPtr = DAG.getCopyFromReg(Chain, dl, 2740 RegInfo->getStackRegister(), 2741 getPointerTy()); 2742 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2743 2744 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2745 ArgChain, 2746 Flags, DAG, dl)); 2747 } else { 2748 // Store relative to framepointer. 
2749 MemOpChains2.push_back( 2750 DAG.getStore(ArgChain, dl, Arg, FIN, 2751 MachinePointerInfo::getFixedStack(FI), 2752 false, false, 0)); 2753 } 2754 } 2755 } 2756 2757 if (!MemOpChains2.empty()) 2758 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2759 &MemOpChains2[0], MemOpChains2.size()); 2760 2761 // Store the return address to the appropriate stack slot. 2762 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, 2763 getPointerTy(), RegInfo->getSlotSize(), 2764 FPDiff, dl); 2765 } 2766 2767 // Build a sequence of copy-to-reg nodes chained together with token chain 2768 // and flag operands which copy the outgoing args into registers. 2769 SDValue InFlag; 2770 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2771 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2772 RegsToPass[i].second, InFlag); 2773 InFlag = Chain.getValue(1); 2774 } 2775 2776 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2777 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2778 // In the 64-bit large code model, we have to make all calls 2779 // through a register, since the call instruction's 32-bit 2780 // pc-relative offset may not be large enough to hold the whole 2781 // address. 2782 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2783 // If the callee is a GlobalAddress node (quite common, every direct call 2784 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 2785 // it. 2786 2787 // We should use extra load for direct calls to dllimported functions in 2788 // non-JIT mode. 2789 const GlobalValue *GV = G->getGlobal(); 2790 if (!GV->hasDLLImportLinkage()) { 2791 unsigned char OpFlags = 0; 2792 bool ExtraLoad = false; 2793 unsigned WrapperKind = ISD::DELETED_NODE; 2794 2795 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2796 // external symbols most go through the PLT in PIC mode. 
If the symbol 2797 // has hidden or protected visibility, or if it is static or local, then 2798 // we don't need to use the PLT - we can directly call it. 2799 if (Subtarget->isTargetELF() && 2800 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2801 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2802 OpFlags = X86II::MO_PLT; 2803 } else if (Subtarget->isPICStyleStubAny() && 2804 (GV->isDeclaration() || GV->isWeakForLinker()) && 2805 (!Subtarget->getTargetTriple().isMacOSX() || 2806 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2807 // PC-relative references to external symbols should go through $stub, 2808 // unless we're building with the leopard linker or later, which 2809 // automatically synthesizes these stubs. 2810 OpFlags = X86II::MO_DARWIN_STUB; 2811 } else if (Subtarget->isPICStyleRIPRel() && 2812 isa<Function>(GV) && 2813 cast<Function>(GV)->getAttributes(). 2814 hasAttribute(AttributeSet::FunctionIndex, 2815 Attribute::NonLazyBind)) { 2816 // If the function is marked as non-lazy, generate an indirect call 2817 // which loads from the GOT directly. This avoids runtime overhead 2818 // at the cost of eager binding (and one extra byte of encoding). 2819 OpFlags = X86II::MO_GOTPCREL; 2820 WrapperKind = X86ISD::WrapperRIP; 2821 ExtraLoad = true; 2822 } 2823 2824 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2825 G->getOffset(), OpFlags); 2826 2827 // Add a wrapper if needed. 2828 if (WrapperKind != ISD::DELETED_NODE) 2829 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); 2830 // Add extra indirection if needed. 
2831 if (ExtraLoad) 2832 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, 2833 MachinePointerInfo::getGOT(), 2834 false, false, false, 0); 2835 } 2836 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2837 unsigned char OpFlags = 0; 2838 2839 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to 2840 // external symbols should go through the PLT. 2841 if (Subtarget->isTargetELF() && 2842 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2843 OpFlags = X86II::MO_PLT; 2844 } else if (Subtarget->isPICStyleStubAny() && 2845 (!Subtarget->getTargetTriple().isMacOSX() || 2846 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2847 // PC-relative references to external symbols should go through $stub, 2848 // unless we're building with the leopard linker or later, which 2849 // automatically synthesizes these stubs. 2850 OpFlags = X86II::MO_DARWIN_STUB; 2851 } 2852 2853 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2854 OpFlags); 2855 } 2856 2857 // Returns a chain & a flag for retval copy to use. 2858 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2859 SmallVector<SDValue, 8> Ops; 2860 2861 if (!IsSibcall && isTailCall) { 2862 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2863 DAG.getIntPtrConstant(0, true), InFlag, dl); 2864 InFlag = Chain.getValue(1); 2865 } 2866 2867 Ops.push_back(Chain); 2868 Ops.push_back(Callee); 2869 2870 if (isTailCall) 2871 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2872 2873 // Add argument registers to the end of the list so that they are known live 2874 // into the call. 2875 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2876 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2877 RegsToPass[i].second.getValueType())); 2878 2879 // Add a register mask operand representing the call-preserved registers. 
2880 const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 2881 const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); 2882 assert(Mask && "Missing call preserved mask for calling convention"); 2883 Ops.push_back(DAG.getRegisterMask(Mask)); 2884 2885 if (InFlag.getNode()) 2886 Ops.push_back(InFlag); 2887 2888 if (isTailCall) { 2889 // We used to do: 2890 //// If this is the first return lowered for this function, add the regs 2891 //// to the liveout set for the function. 2892 // This isn't right, although it's probably harmless on x86; liveouts 2893 // should be computed from returns not tail calls. Consider a void 2894 // function making a tail call to a function returning int. 2895 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); 2896 } 2897 2898 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2899 InFlag = Chain.getValue(1); 2900 2901 // Create the CALLSEQ_END node. 2902 unsigned NumBytesForCalleeToPush; 2903 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 2904 getTargetMachine().Options.GuaranteedTailCallOpt)) 2905 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2906 else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && 2907 SR == StackStructReturn) 2908 // If this is a call to a struct-return function, the callee 2909 // pops the hidden struct pointer, so we have to push it back. 2910 // This is common for Darwin/X86, Linux & Mingw32 targets. 2911 // For MSVC Win32 targets, the caller pops the hidden struct pointer. 2912 NumBytesForCalleeToPush = 4; 2913 else 2914 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2915 2916 // Returns a flag for retval copy to use. 
2917 if (!IsSibcall) { 2918 Chain = DAG.getCALLSEQ_END(Chain, 2919 DAG.getIntPtrConstant(NumBytes, true), 2920 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2921 true), 2922 InFlag, dl); 2923 InFlag = Chain.getValue(1); 2924 } 2925 2926 // Handle result values, copying them out of physregs into vregs that we 2927 // return. 2928 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2929 Ins, dl, DAG, InVals); 2930} 2931 2932//===----------------------------------------------------------------------===// 2933// Fast Calling Convention (tail call) implementation 2934//===----------------------------------------------------------------------===// 2935 2936// Like std call, callee cleans arguments, convention except that ECX is 2937// reserved for storing the tail called function address. Only 2 registers are 2938// free for argument passing (inreg). Tail call optimization is performed 2939// provided: 2940// * tailcallopt is enabled 2941// * caller/callee are fastcc 2942// On X86_64 architecture with GOT-style position independent code only local 2943// (within module) calls are supported at the moment. 2944// To keep the stack aligned according to platform abi the function 2945// GetAlignedArgumentStackSize ensures that argument delta is always multiples 2946// of stack alignment. (Dynamic linkers need this - darwin's dyld for example) 2947// If a tail called function callee has more arguments than the caller the 2948// caller needs to make sure that there is room to move the RETADDR to. This is 2949// achieved by reserving an area the size of the argument delta right after the 2950// original REtADDR, but before the saved framepointer or the spilled registers 2951// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) 2952// stack layout: 2953// arg1 2954// arg2 2955// RETADDR 2956// [ new RETADDR 2957// move area ] 2958// (possible EBP) 2959// ESI 2960// EDI 2961// local1 .. 
2962 2963/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned 2964/// for a 16 byte align requirement. 2965unsigned 2966X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2967 SelectionDAG& DAG) const { 2968 MachineFunction &MF = DAG.getMachineFunction(); 2969 const TargetMachine &TM = MF.getTarget(); 2970 const X86RegisterInfo *RegInfo = 2971 static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); 2972 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 2973 unsigned StackAlignment = TFI.getStackAlignment(); 2974 uint64_t AlignMask = StackAlignment - 1; 2975 int64_t Offset = StackSize; 2976 unsigned SlotSize = RegInfo->getSlotSize(); 2977 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2978 // Number smaller than 12 so just add the difference. 2979 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2980 } else { 2981 // Mask out lower bits, add stackalignment once plus the 12 bytes. 2982 Offset = ((~AlignMask) & Offset) + StackAlignment + 2983 (StackAlignment-SlotSize); 2984 } 2985 return Offset; 2986} 2987 2988/// MatchingStackOffset - Return true if the given stack call argument is 2989/// already available in the same position (relatively) of the caller's 2990/// incoming argument stack. 
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
                         const X86InstrInfo *TII) {
  // Bytes/FI describe the candidate incoming-argument object; both must end
  // up matching the fixed stack object's size and offset below.
  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
  int FI = INT_MAX;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    // The value was copied out of a vreg; trace the vreg back to its
    // defining instruction to see if it came from an incoming stack slot.
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!TargetRegisterInfo::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(Def, FI))
        return false;
    } else {
      // For byval, the pointer itself is the argument; accept an LEA of a
      // frame index and compare the byval size instead of the pointer size.
      unsigned Opcode = Def->getOpcode();
      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
          Def->getOperand(1).isFI()) {
        FI = Def->getOperand(1).getIndex();
        Bytes = Flags.getByValSize();
      } else
        return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
    // Byval argument forwarded directly as a frame index.
    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
    FI = FINode->getIndex();
    Bytes = Flags.getByValSize();
  } else
    return false;

  assert(FI != INT_MAX);
  // Only fixed objects live in the caller's incoming argument area; the
  // argument matches when its offset and size agree with that object.
  if (!MFI->isFixedObjectIndex(FI))
    return false;
  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
3045bool 3046X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 3047 CallingConv::ID CalleeCC, 3048 bool isVarArg, 3049 bool isCalleeStructRet, 3050 bool isCallerStructRet, 3051 Type *RetTy, 3052 const SmallVectorImpl<ISD::OutputArg> &Outs, 3053 const SmallVectorImpl<SDValue> &OutVals, 3054 const SmallVectorImpl<ISD::InputArg> &Ins, 3055 SelectionDAG &DAG) const { 3056 if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) 3057 return false; 3058 3059 // If -tailcallopt is specified, make fastcc functions tail-callable. 3060 const MachineFunction &MF = DAG.getMachineFunction(); 3061 const Function *CallerF = MF.getFunction(); 3062 3063 // If the function return type is x86_fp80 and the callee return type is not, 3064 // then the FP_EXTEND of the call result is not a nop. It's not safe to 3065 // perform a tailcall optimization here. 3066 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) 3067 return false; 3068 3069 CallingConv::ID CallerCC = CallerF->getCallingConv(); 3070 bool CCMatch = CallerCC == CalleeCC; 3071 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); 3072 bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); 3073 3074 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 3075 if (IsTailCallConvention(CalleeCC) && CCMatch) 3076 return true; 3077 return false; 3078 } 3079 3080 // Look for obvious safe cases to perform tail call optimization that do not 3081 // require ABI changes. This is what gcc calls sibcall. 3082 3083 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 3084 // emit a special epilogue. 3085 const X86RegisterInfo *RegInfo = 3086 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 3087 if (RegInfo->needsStackRealignment(MF)) 3088 return false; 3089 3090 // Also avoid sibcall optimization if either caller or callee uses struct 3091 // return semantics. 
3092 if (isCalleeStructRet || isCallerStructRet) 3093 return false; 3094 3095 // An stdcall caller is expected to clean up its arguments; the callee 3096 // isn't going to do that. 3097 if (!CCMatch && CallerCC == CallingConv::X86_StdCall) 3098 return false; 3099 3100 // Do not sibcall optimize vararg calls unless all arguments are passed via 3101 // registers. 3102 if (isVarArg && !Outs.empty()) { 3103 3104 // Optimizing for varargs on Win64 is unlikely to be safe without 3105 // additional testing. 3106 if (IsCalleeWin64 || IsCallerWin64) 3107 return false; 3108 3109 SmallVector<CCValAssign, 16> ArgLocs; 3110 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 3111 getTargetMachine(), ArgLocs, *DAG.getContext()); 3112 3113 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 3114 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 3115 if (!ArgLocs[i].isRegLoc()) 3116 return false; 3117 } 3118 3119 // If the call result is in ST0 / ST1, it needs to be popped off the x87 3120 // stack. Therefore, if it's not used by the call it is not safe to optimize 3121 // this into a sibcall. 3122 bool Unused = false; 3123 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 3124 if (!Ins[i].Used) { 3125 Unused = true; 3126 break; 3127 } 3128 } 3129 if (Unused) { 3130 SmallVector<CCValAssign, 16> RVLocs; 3131 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), 3132 getTargetMachine(), RVLocs, *DAG.getContext()); 3133 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 3134 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 3135 CCValAssign &VA = RVLocs[i]; 3136 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 3137 return false; 3138 } 3139 } 3140 3141 // If the calling conventions do not match, then we'd better make sure the 3142 // results are returned in the same way as what the caller expects. 
3143 if (!CCMatch) { 3144 SmallVector<CCValAssign, 16> RVLocs1; 3145 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), 3146 getTargetMachine(), RVLocs1, *DAG.getContext()); 3147 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 3148 3149 SmallVector<CCValAssign, 16> RVLocs2; 3150 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), 3151 getTargetMachine(), RVLocs2, *DAG.getContext()); 3152 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 3153 3154 if (RVLocs1.size() != RVLocs2.size()) 3155 return false; 3156 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 3157 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 3158 return false; 3159 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 3160 return false; 3161 if (RVLocs1[i].isRegLoc()) { 3162 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 3163 return false; 3164 } else { 3165 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 3166 return false; 3167 } 3168 } 3169 } 3170 3171 // If the callee takes no arguments then go on to check the results of the 3172 // call. 3173 if (!Outs.empty()) { 3174 // Check if stack adjustment is needed. For now, do not do this if any 3175 // argument is passed on the stack. 3176 SmallVector<CCValAssign, 16> ArgLocs; 3177 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 3178 getTargetMachine(), ArgLocs, *DAG.getContext()); 3179 3180 // Allocate shadow area for Win64 3181 if (IsCalleeWin64) 3182 CCInfo.AllocateStack(32, 8); 3183 3184 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 3185 if (CCInfo.getNextStackOffset()) { 3186 MachineFunction &MF = DAG.getMachineFunction(); 3187 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 3188 return false; 3189 3190 // Check if the arguments are already laid out in the right way as 3191 // the caller's fixed stack objects. 
3192 MachineFrameInfo *MFI = MF.getFrameInfo(); 3193 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 3194 const X86InstrInfo *TII = 3195 ((const X86TargetMachine&)getTargetMachine()).getInstrInfo(); 3196 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3197 CCValAssign &VA = ArgLocs[i]; 3198 SDValue Arg = OutVals[i]; 3199 ISD::ArgFlagsTy Flags = Outs[i].Flags; 3200 if (VA.getLocInfo() == CCValAssign::Indirect) 3201 return false; 3202 if (!VA.isRegLoc()) { 3203 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 3204 MFI, MRI, TII)) 3205 return false; 3206 } 3207 } 3208 } 3209 3210 // If the tailcall address may be in a register, then make sure it's 3211 // possible to register allocate for it. In 32-bit, the call address can 3212 // only target EAX, EDX, or ECX since the tail call must be scheduled after 3213 // callee-saved registers are restored. These happen to be the same 3214 // registers used to pass 'inreg' arguments so watch out for those. 3215 if (!Subtarget->is64Bit() && 3216 ((!isa<GlobalAddressSDNode>(Callee) && 3217 !isa<ExternalSymbolSDNode>(Callee)) || 3218 getTargetMachine().getRelocationModel() == Reloc::PIC_)) { 3219 unsigned NumInRegs = 0; 3220 // In PIC we need an extra register to formulate the address computation 3221 // for the callee. 3222 unsigned MaxInRegs = 3223 (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 
2 : 3; 3224 3225 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3226 CCValAssign &VA = ArgLocs[i]; 3227 if (!VA.isRegLoc()) 3228 continue; 3229 unsigned Reg = VA.getLocReg(); 3230 switch (Reg) { 3231 default: break; 3232 case X86::EAX: case X86::EDX: case X86::ECX: 3233 if (++NumInRegs == MaxInRegs) 3234 return false; 3235 break; 3236 } 3237 } 3238 } 3239 } 3240 3241 return true; 3242} 3243 3244FastISel * 3245X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 3246 const TargetLibraryInfo *libInfo) const { 3247 return X86::createFastISel(funcInfo, libInfo); 3248} 3249 3250//===----------------------------------------------------------------------===// 3251// Other Lowering Hooks 3252//===----------------------------------------------------------------------===// 3253 3254static bool MayFoldLoad(SDValue Op) { 3255 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 3256} 3257 3258static bool MayFoldIntoStore(SDValue Op) { 3259 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 3260} 3261 3262static bool isTargetShuffle(unsigned Opcode) { 3263 switch(Opcode) { 3264 default: return false; 3265 case X86ISD::PSHUFD: 3266 case X86ISD::PSHUFHW: 3267 case X86ISD::PSHUFLW: 3268 case X86ISD::SHUFP: 3269 case X86ISD::PALIGNR: 3270 case X86ISD::MOVLHPS: 3271 case X86ISD::MOVLHPD: 3272 case X86ISD::MOVHLPS: 3273 case X86ISD::MOVLPS: 3274 case X86ISD::MOVLPD: 3275 case X86ISD::MOVSHDUP: 3276 case X86ISD::MOVSLDUP: 3277 case X86ISD::MOVDDUP: 3278 case X86ISD::MOVSS: 3279 case X86ISD::MOVSD: 3280 case X86ISD::UNPCKL: 3281 case X86ISD::UNPCKH: 3282 case X86ISD::VPERMILP: 3283 case X86ISD::VPERM2X128: 3284 case X86ISD::VPERMI: 3285 return true; 3286 } 3287} 3288 3289static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, 3290 SDValue V1, SelectionDAG &DAG) { 3291 switch(Opc) { 3292 default: llvm_unreachable("Unknown x86 shuffle node"); 3293 case X86ISD::MOVSHDUP: 3294 case X86ISD::MOVSLDUP: 3295 case X86ISD::MOVDDUP: 
3296 return DAG.getNode(Opc, dl, VT, V1); 3297 } 3298} 3299 3300static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, 3301 SDValue V1, unsigned TargetMask, 3302 SelectionDAG &DAG) { 3303 switch(Opc) { 3304 default: llvm_unreachable("Unknown x86 shuffle node"); 3305 case X86ISD::PSHUFD: 3306 case X86ISD::PSHUFHW: 3307 case X86ISD::PSHUFLW: 3308 case X86ISD::VPERMILP: 3309 case X86ISD::VPERMI: 3310 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 3311 } 3312} 3313 3314static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, 3315 SDValue V1, SDValue V2, unsigned TargetMask, 3316 SelectionDAG &DAG) { 3317 switch(Opc) { 3318 default: llvm_unreachable("Unknown x86 shuffle node"); 3319 case X86ISD::PALIGNR: 3320 case X86ISD::SHUFP: 3321 case X86ISD::VPERM2X128: 3322 return DAG.getNode(Opc, dl, VT, V1, V2, 3323 DAG.getConstant(TargetMask, MVT::i8)); 3324 } 3325} 3326 3327static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, 3328 SDValue V1, SDValue V2, SelectionDAG &DAG) { 3329 switch(Opc) { 3330 default: llvm_unreachable("Unknown x86 shuffle node"); 3331 case X86ISD::MOVLHPS: 3332 case X86ISD::MOVLHPD: 3333 case X86ISD::MOVHLPS: 3334 case X86ISD::MOVLPS: 3335 case X86ISD::MOVLPD: 3336 case X86ISD::MOVSS: 3337 case X86ISD::MOVSD: 3338 case X86ISD::UNPCKL: 3339 case X86ISD::UNPCKH: 3340 return DAG.getNode(Opc, dl, VT, V1, V2); 3341 } 3342} 3343 3344SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 3345 MachineFunction &MF = DAG.getMachineFunction(); 3346 const X86RegisterInfo *RegInfo = 3347 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 3348 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 3349 int ReturnAddrIndex = FuncInfo->getRAIndex(); 3350 3351 if (ReturnAddrIndex == 0) { 3352 // Set up a frame object for the return address. 
3353 unsigned SlotSize = RegInfo->getSlotSize(); 3354 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, 3355 -(int64_t)SlotSize, 3356 false); 3357 FuncInfo->setRAIndex(ReturnAddrIndex); 3358 } 3359 3360 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 3361} 3362 3363bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 3364 bool hasSymbolicDisplacement) { 3365 // Offset should fit into 32 bit immediate field. 3366 if (!isInt<32>(Offset)) 3367 return false; 3368 3369 // If we don't have a symbolic displacement - we don't have any extra 3370 // restrictions. 3371 if (!hasSymbolicDisplacement) 3372 return true; 3373 3374 // FIXME: Some tweaks might be needed for medium code model. 3375 if (M != CodeModel::Small && M != CodeModel::Kernel) 3376 return false; 3377 3378 // For small code model we assume that latest object is 16MB before end of 31 3379 // bits boundary. We may also accept pretty large negative constants knowing 3380 // that all objects are in the positive half of address space. 3381 if (M == CodeModel::Small && Offset < 16*1024*1024) 3382 return true; 3383 3384 // For kernel code model we know that all object resist in the negative half 3385 // of 32bits address space. We may not accept negative offsets, since they may 3386 // be just off and we may accept pretty large positive ones. 3387 if (M == CodeModel::Kernel && Offset > 0) 3388 return true; 3389 3390 return false; 3391} 3392 3393/// isCalleePop - Determines whether the callee is required to pop its 3394/// own arguments. Callee pop is necessary to support tail calls. 
3395bool X86::isCalleePop(CallingConv::ID CallingConv, 3396 bool is64Bit, bool IsVarArg, bool TailCallOpt) { 3397 if (IsVarArg) 3398 return false; 3399 3400 switch (CallingConv) { 3401 default: 3402 return false; 3403 case CallingConv::X86_StdCall: 3404 return !is64Bit; 3405 case CallingConv::X86_FastCall: 3406 return !is64Bit; 3407 case CallingConv::X86_ThisCall: 3408 return !is64Bit; 3409 case CallingConv::Fast: 3410 return TailCallOpt; 3411 case CallingConv::GHC: 3412 return TailCallOpt; 3413 case CallingConv::HiPE: 3414 return TailCallOpt; 3415 } 3416} 3417 3418/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 3419/// specific condition code, returning the condition code and the LHS/RHS of the 3420/// comparison to make. 3421static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 3422 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 3423 if (!isFP) { 3424 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 3425 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 3426 // X > -1 -> X == 0, jump !sign. 3427 RHS = DAG.getConstant(0, RHS.getValueType()); 3428 return X86::COND_NS; 3429 } 3430 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 3431 // X < 0 -> X == 0, jump on sign. 
3432 return X86::COND_S; 3433 } 3434 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 3435 // X < 1 -> X <= 0 3436 RHS = DAG.getConstant(0, RHS.getValueType()); 3437 return X86::COND_LE; 3438 } 3439 } 3440 3441 switch (SetCCOpcode) { 3442 default: llvm_unreachable("Invalid integer condition!"); 3443 case ISD::SETEQ: return X86::COND_E; 3444 case ISD::SETGT: return X86::COND_G; 3445 case ISD::SETGE: return X86::COND_GE; 3446 case ISD::SETLT: return X86::COND_L; 3447 case ISD::SETLE: return X86::COND_LE; 3448 case ISD::SETNE: return X86::COND_NE; 3449 case ISD::SETULT: return X86::COND_B; 3450 case ISD::SETUGT: return X86::COND_A; 3451 case ISD::SETULE: return X86::COND_BE; 3452 case ISD::SETUGE: return X86::COND_AE; 3453 } 3454 } 3455 3456 // First determine if it is required or is profitable to flip the operands. 3457 3458 // If LHS is a foldable load, but RHS is not, flip the condition. 3459 if (ISD::isNON_EXTLoad(LHS.getNode()) && 3460 !ISD::isNON_EXTLoad(RHS.getNode())) { 3461 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 3462 std::swap(LHS, RHS); 3463 } 3464 3465 switch (SetCCOpcode) { 3466 default: break; 3467 case ISD::SETOLT: 3468 case ISD::SETOLE: 3469 case ISD::SETUGT: 3470 case ISD::SETUGE: 3471 std::swap(LHS, RHS); 3472 break; 3473 } 3474 3475 // On a floating point condition, the flags are set as follows: 3476 // ZF PF CF op 3477 // 0 | 0 | 0 | X > Y 3478 // 0 | 0 | 1 | X < Y 3479 // 1 | 0 | 0 | X == Y 3480 // 1 | 1 | 1 | unordered 3481 switch (SetCCOpcode) { 3482 default: llvm_unreachable("Condcode should be pre-legalized away"); 3483 case ISD::SETUEQ: 3484 case ISD::SETEQ: return X86::COND_E; 3485 case ISD::SETOLT: // flipped 3486 case ISD::SETOGT: 3487 case ISD::SETGT: return X86::COND_A; 3488 case ISD::SETOLE: // flipped 3489 case ISD::SETOGE: 3490 case ISD::SETGE: return X86::COND_AE; 3491 case ISD::SETUGT: // flipped 3492 case ISD::SETULT: 3493 case ISD::SETLT: return X86::COND_B; 3494 case ISD::SETUGE: // flipped 3495 case 
ISD::SETULE: 3496 case ISD::SETLE: return X86::COND_BE; 3497 case ISD::SETONE: 3498 case ISD::SETNE: return X86::COND_NE; 3499 case ISD::SETUO: return X86::COND_P; 3500 case ISD::SETO: return X86::COND_NP; 3501 case ISD::SETOEQ: 3502 case ISD::SETUNE: return X86::COND_INVALID; 3503 } 3504} 3505 3506/// hasFPCMov - is there a floating point cmov for the specific X86 condition 3507/// code. Current x86 isa includes the following FP cmov instructions: 3508/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 3509static bool hasFPCMov(unsigned X86CC) { 3510 switch (X86CC) { 3511 default: 3512 return false; 3513 case X86::COND_B: 3514 case X86::COND_BE: 3515 case X86::COND_E: 3516 case X86::COND_P: 3517 case X86::COND_A: 3518 case X86::COND_AE: 3519 case X86::COND_NE: 3520 case X86::COND_NP: 3521 return true; 3522 } 3523} 3524 3525/// isFPImmLegal - Returns true if the target can instruction select the 3526/// specified FP immediate natively. If false, the legalizer will 3527/// materialize the FP immediate as a load from a constant pool. 3528bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 3529 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 3530 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 3531 return true; 3532 } 3533 return false; 3534} 3535 3536/// isUndefOrInRange - Return true if Val is undef or if its value falls within 3537/// the specified range (L, H]. 3538static bool isUndefOrInRange(int Val, int Low, int Hi) { 3539 return (Val < 0) || (Val >= Low && Val < Hi); 3540} 3541 3542/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 3543/// specified value. 3544static bool isUndefOrEqual(int Val, int CmpVal) { 3545 return (Val < 0 || Val == CmpVal); 3546} 3547 3548/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning 3549/// from position Pos and ending in Pos+Size, falls within the specified 3550/// sequential range (L, L+Pos]. or is undef. 
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
                                       unsigned Pos, unsigned Size, int Low) {
  // Walk Mask[Pos..Pos+Size) expecting the consecutive values
  // Low, Low+1, ...; undef (negative) entries are accepted anywhere.
  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
    if (!isUndefOrEqual(Mask[i], Low))
      return false;
  return true;
}

/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
/// the second operand.
static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
  // Only the 128-bit 4x32 and 2x64 types qualify; every mask index must
  // select from the first source operand only.
  if (VT == MVT::v4f32 || VT == MVT::v4i32 )
    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
  if (VT == MVT::v2f64 || VT == MVT::v2i64)
    return (Mask[0] < 2 && Mask[1] < 2);
  return false;
}

/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFHW.  v16i16 is only accepted with AVX2
/// (HasInt256), where the check is repeated per 128-bit lane.
static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
    return false;

  // Lower quadword copied in order or undef.
  if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
    return false;

  // Upper quadword shuffled.
  for (unsigned i = 4; i != 8; ++i)
    if (!isUndefOrInRange(Mask[i], 4, 8))
      return false;

  if (VT == MVT::v16i16) {
    // Lower quadword copied in order or undef.
    if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
      return false;

    // Upper quadword shuffled.
    for (unsigned i = 12; i != 16; ++i)
      if (!isUndefOrInRange(Mask[i], 12, 16))
        return false;
  }

  return true;
}

/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFLW.  v16i16 is only accepted with AVX2
/// (HasInt256), where the check is repeated per 128-bit lane.
static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
    return false;

  // Upper quadword copied in order.
  if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
    return false;

  // Lower quadword shuffled.
  for (unsigned i = 0; i != 4; ++i)
    if (!isUndefOrInRange(Mask[i], 0, 4))
      return false;

  if (VT == MVT::v16i16) {
    // Upper quadword copied in order.
    if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
      return false;

    // Lower quadword shuffled.
    for (unsigned i = 8; i != 12; ++i)
      if (!isUndefOrInRange(Mask[i], 8, 12))
        return false;
  }

  return true;
}

/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PALIGNR.  The mask must describe, per 128-bit
/// lane, a contiguous byte rotation of the two sources.
static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
                          const X86Subtarget *Subtarget) {
  // PALIGNR requires SSSE3; the 256-bit form requires AVX2.
  if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
      (VT.is256BitVector() && !Subtarget->hasInt256()))
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts/NumLanes;

  // Do not handle 64-bit element shuffles with palignr.
  if (NumLaneElts == 2)
    return false;

  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
    // Find the first non-undef element of the lane.
    unsigned i;
    for (i = 0; i != NumLaneElts; ++i) {
      if (Mask[i+l] >= 0)
        break;
    }

    // Lane is all undef, go to next lane
    if (i == NumLaneElts)
      continue;

    int Start = Mask[i+l];

    // Make sure its in this lane in one of the sources
    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
      return false;

    // If not lane 0, then we must match lane 0
    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
      return false;

    // Correct second source to be contiguous with first source
    if (Start >= (int)NumElts)
      Start -= NumElts - NumLaneElts;

    // Make sure we're shifting in the right direction.
    if (Start <= (int)(i+l))
      return false;

    Start -= i;

    // Check the rest of the elements to see if they are consecutive.
    for (++i; i != NumLaneElts; ++i) {
      int Idx = Mask[i+l];

      // Make sure its in this lane
      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
        return false;

      // If not lane 0, then we must match lane 0
      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
        return false;

      if (Idx >= (int)NumElts)
        Idx -= NumElts - NumLaneElts;

      if (!isUndefOrEqual(Idx, Start+i))
        return false;

    }
  }

  return true;
}

/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
/// the two vector operands have swapped position.
3703static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, 3704 unsigned NumElems) { 3705 for (unsigned i = 0; i != NumElems; ++i) { 3706 int idx = Mask[i]; 3707 if (idx < 0) 3708 continue; 3709 else if (idx < (int)NumElems) 3710 Mask[i] = idx + NumElems; 3711 else 3712 Mask[i] = idx - NumElems; 3713 } 3714} 3715 3716/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 3717/// specifies a shuffle of elements that is suitable for input to 128/256-bit 3718/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be 3719/// reverse of what x86 shuffles want. 3720static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) { 3721 3722 unsigned NumElems = VT.getVectorNumElements(); 3723 unsigned NumLanes = VT.getSizeInBits()/128; 3724 unsigned NumLaneElems = NumElems/NumLanes; 3725 3726 if (NumLaneElems != 2 && NumLaneElems != 4) 3727 return false; 3728 3729 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 3730 bool symetricMaskRequired = 3731 (VT.getSizeInBits() >= 256) && (EltSize == 32); 3732 3733 // VSHUFPSY divides the resulting vector into 4 chunks. 3734 // The sources are also splitted into 4 chunks, and each destination 3735 // chunk must come from a different source chunk. 3736 // 3737 // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0 3738 // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9 3739 // 3740 // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4, 3741 // Y3..Y0, Y3..Y0, X3..X0, X3..X0 3742 // 3743 // VSHUFPDY divides the resulting vector into 4 chunks. 3744 // The sources are also splitted into 4 chunks, and each destination 3745 // chunk must come from a different source chunk. 
3746 // 3747 // SRC1 => X3 X2 X1 X0 3748 // SRC2 => Y3 Y2 Y1 Y0 3749 // 3750 // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0 3751 // 3752 SmallVector<int, 4> MaskVal(NumLaneElems, -1); 3753 unsigned HalfLaneElems = NumLaneElems/2; 3754 for (unsigned l = 0; l != NumElems; l += NumLaneElems) { 3755 for (unsigned i = 0; i != NumLaneElems; ++i) { 3756 int Idx = Mask[i+l]; 3757 unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0); 3758 if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems)) 3759 return false; 3760 // For VSHUFPSY, the mask of the second half must be the same as the 3761 // first but with the appropriate offsets. This works in the same way as 3762 // VPERMILPS works with masks. 3763 if (!symetricMaskRequired || Idx < 0) 3764 continue; 3765 if (MaskVal[i] < 0) { 3766 MaskVal[i] = Idx - l; 3767 continue; 3768 } 3769 if ((signed)(Idx - l) != MaskVal[i]) 3770 return false; 3771 } 3772 } 3773 3774 return true; 3775} 3776 3777/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 3778/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 3779static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) { 3780 if (!VT.is128BitVector()) 3781 return false; 3782 3783 unsigned NumElems = VT.getVectorNumElements(); 3784 3785 if (NumElems != 4) 3786 return false; 3787 3788 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 3789 return isUndefOrEqual(Mask[0], 6) && 3790 isUndefOrEqual(Mask[1], 7) && 3791 isUndefOrEqual(Mask[2], 2) && 3792 isUndefOrEqual(Mask[3], 3); 3793} 3794 3795/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 3796/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. 
vector_shuffle v, undef, 3797/// <2, 3, 2, 3> 3798static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) { 3799 if (!VT.is128BitVector()) 3800 return false; 3801 3802 unsigned NumElems = VT.getVectorNumElements(); 3803 3804 if (NumElems != 4) 3805 return false; 3806 3807 return isUndefOrEqual(Mask[0], 2) && 3808 isUndefOrEqual(Mask[1], 3) && 3809 isUndefOrEqual(Mask[2], 2) && 3810 isUndefOrEqual(Mask[3], 3); 3811} 3812 3813/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3814/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 3815static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) { 3816 if (!VT.is128BitVector()) 3817 return false; 3818 3819 unsigned NumElems = VT.getVectorNumElements(); 3820 3821 if (NumElems != 2 && NumElems != 4) 3822 return false; 3823 3824 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3825 if (!isUndefOrEqual(Mask[i], i + NumElems)) 3826 return false; 3827 3828 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) 3829 if (!isUndefOrEqual(Mask[i], i)) 3830 return false; 3831 3832 return true; 3833} 3834 3835/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3836/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 3837static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) { 3838 if (!VT.is128BitVector()) 3839 return false; 3840 3841 unsigned NumElems = VT.getVectorNumElements(); 3842 3843 if (NumElems != 2 && NumElems != 4) 3844 return false; 3845 3846 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3847 if (!isUndefOrEqual(Mask[i], i)) 3848 return false; 3849 3850 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3851 if (!isUndefOrEqual(Mask[i + e], i + NumElems)) 3852 return false; 3853 3854 return true; 3855} 3856 3857// 3858// Some special combinations that can be optimized. 
3859// 3860static 3861SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, 3862 SelectionDAG &DAG) { 3863 MVT VT = SVOp->getSimpleValueType(0); 3864 SDLoc dl(SVOp); 3865 3866 if (VT != MVT::v8i32 && VT != MVT::v8f32) 3867 return SDValue(); 3868 3869 ArrayRef<int> Mask = SVOp->getMask(); 3870 3871 // These are the special masks that may be optimized. 3872 static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14}; 3873 static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15}; 3874 bool MatchEvenMask = true; 3875 bool MatchOddMask = true; 3876 for (int i=0; i<8; ++i) { 3877 if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i])) 3878 MatchEvenMask = false; 3879 if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) 3880 MatchOddMask = false; 3881 } 3882 3883 if (!MatchEvenMask && !MatchOddMask) 3884 return SDValue(); 3885 3886 SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); 3887 3888 SDValue Op0 = SVOp->getOperand(0); 3889 SDValue Op1 = SVOp->getOperand(1); 3890 3891 if (MatchEvenMask) { 3892 // Shift the second operand right to 32 bits. 3893 static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 }; 3894 Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask); 3895 } else { 3896 // Shift the first operand left to 32 bits. 3897 static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 }; 3898 Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask); 3899 } 3900 static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15}; 3901 return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask); 3902} 3903 3904/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3905/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3906static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT, 3907 bool HasInt256, bool V2IsSplat = false) { 3908 3909 assert(VT.getSizeInBits() >= 128 && 3910 "Unsupported vector type for unpckl"); 3911 3912 // AVX defines UNPCK* to operate independently on 128-bit lanes. 
3913 unsigned NumLanes; 3914 unsigned NumOf256BitLanes; 3915 unsigned NumElts = VT.getVectorNumElements(); 3916 if (VT.is256BitVector()) { 3917 if (NumElts != 4 && NumElts != 8 && 3918 (!HasInt256 || (NumElts != 16 && NumElts != 32))) 3919 return false; 3920 NumLanes = 2; 3921 NumOf256BitLanes = 1; 3922 } else if (VT.is512BitVector()) { 3923 assert(VT.getScalarType().getSizeInBits() >= 32 && 3924 "Unsupported vector type for unpckh"); 3925 NumLanes = 2; 3926 NumOf256BitLanes = 2; 3927 } else { 3928 NumLanes = 1; 3929 NumOf256BitLanes = 1; 3930 } 3931 3932 unsigned NumEltsInStride = NumElts/NumOf256BitLanes; 3933 unsigned NumLaneElts = NumEltsInStride/NumLanes; 3934 3935 for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) { 3936 for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) { 3937 for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { 3938 int BitI = Mask[l256*NumEltsInStride+l+i]; 3939 int BitI1 = Mask[l256*NumEltsInStride+l+i+1]; 3940 if (!isUndefOrEqual(BitI, j+l256*NumElts)) 3941 return false; 3942 if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts)) 3943 return false; 3944 if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride)) 3945 return false; 3946 } 3947 } 3948 } 3949 return true; 3950} 3951 3952/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3953/// specifies a shuffle of elements that is suitable for input to UNPCKH. 3954static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT, 3955 bool HasInt256, bool V2IsSplat = false) { 3956 assert(VT.getSizeInBits() >= 128 && 3957 "Unsupported vector type for unpckh"); 3958 3959 // AVX defines UNPCK* to operate independently on 128-bit lanes. 
3960 unsigned NumLanes; 3961 unsigned NumOf256BitLanes; 3962 unsigned NumElts = VT.getVectorNumElements(); 3963 if (VT.is256BitVector()) { 3964 if (NumElts != 4 && NumElts != 8 && 3965 (!HasInt256 || (NumElts != 16 && NumElts != 32))) 3966 return false; 3967 NumLanes = 2; 3968 NumOf256BitLanes = 1; 3969 } else if (VT.is512BitVector()) { 3970 assert(VT.getScalarType().getSizeInBits() >= 32 && 3971 "Unsupported vector type for unpckh"); 3972 NumLanes = 2; 3973 NumOf256BitLanes = 2; 3974 } else { 3975 NumLanes = 1; 3976 NumOf256BitLanes = 1; 3977 } 3978 3979 unsigned NumEltsInStride = NumElts/NumOf256BitLanes; 3980 unsigned NumLaneElts = NumEltsInStride/NumLanes; 3981 3982 for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) { 3983 for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) { 3984 for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { 3985 int BitI = Mask[l256*NumEltsInStride+l+i]; 3986 int BitI1 = Mask[l256*NumEltsInStride+l+i+1]; 3987 if (!isUndefOrEqual(BitI, j+l256*NumElts)) 3988 return false; 3989 if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts)) 3990 return false; 3991 if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride)) 3992 return false; 3993 } 3994 } 3995 } 3996 return true; 3997} 3998 3999/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 4000/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. 
vector_shuffle v, undef, 4001/// <0, 0, 1, 1> 4002static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { 4003 unsigned NumElts = VT.getVectorNumElements(); 4004 bool Is256BitVec = VT.is256BitVector(); 4005 4006 if (VT.is512BitVector()) 4007 return false; 4008 assert((VT.is128BitVector() || VT.is256BitVector()) && 4009 "Unsupported vector type for unpckh"); 4010 4011 if (Is256BitVec && NumElts != 4 && NumElts != 8 && 4012 (!HasInt256 || (NumElts != 16 && NumElts != 32))) 4013 return false; 4014 4015 // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern 4016 // FIXME: Need a better way to get rid of this, there's no latency difference 4017 // between UNPCKLPD and MOVDDUP, the later should always be checked first and 4018 // the former later. We should also remove the "_undef" special mask. 4019 if (NumElts == 4 && Is256BitVec) 4020 return false; 4021 4022 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 4023 // independently on 128-bit lanes. 4024 unsigned NumLanes = VT.getSizeInBits()/128; 4025 unsigned NumLaneElts = NumElts/NumLanes; 4026 4027 for (unsigned l = 0; l != NumElts; l += NumLaneElts) { 4028 for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { 4029 int BitI = Mask[l+i]; 4030 int BitI1 = Mask[l+i+1]; 4031 4032 if (!isUndefOrEqual(BitI, j)) 4033 return false; 4034 if (!isUndefOrEqual(BitI1, j)) 4035 return false; 4036 } 4037 } 4038 4039 return true; 4040} 4041 4042/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 4043/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. 
vector_shuffle v, undef, 4044/// <2, 2, 3, 3> 4045static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { 4046 unsigned NumElts = VT.getVectorNumElements(); 4047 4048 if (VT.is512BitVector()) 4049 return false; 4050 4051 assert((VT.is128BitVector() || VT.is256BitVector()) && 4052 "Unsupported vector type for unpckh"); 4053 4054 if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && 4055 (!HasInt256 || (NumElts != 16 && NumElts != 32))) 4056 return false; 4057 4058 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 4059 // independently on 128-bit lanes. 4060 unsigned NumLanes = VT.getSizeInBits()/128; 4061 unsigned NumLaneElts = NumElts/NumLanes; 4062 4063 for (unsigned l = 0; l != NumElts; l += NumLaneElts) { 4064 for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { 4065 int BitI = Mask[l+i]; 4066 int BitI1 = Mask[l+i+1]; 4067 if (!isUndefOrEqual(BitI, j)) 4068 return false; 4069 if (!isUndefOrEqual(BitI1, j)) 4070 return false; 4071 } 4072 } 4073 return true; 4074} 4075 4076/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 4077/// specifies a shuffle of elements that is suitable for input to MOVSS, 4078/// MOVSD, and MOVD, i.e. setting the lowest element. 4079static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) { 4080 if (VT.getVectorElementType().getSizeInBits() < 32) 4081 return false; 4082 if (!VT.is128BitVector()) 4083 return false; 4084 4085 unsigned NumElts = VT.getVectorNumElements(); 4086 4087 if (!isUndefOrEqual(Mask[0], NumElts)) 4088 return false; 4089 4090 for (unsigned i = 1; i != NumElts; ++i) 4091 if (!isUndefOrEqual(Mask[i], i)) 4092 return false; 4093 4094 return true; 4095} 4096 4097/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered 4098/// as permutations between 128-bit chunks or halves. 
As an example: this 4099/// shuffle bellow: 4100/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> 4101/// The first half comes from the second half of V1 and the second half from the 4102/// the second half of V2. 4103static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) { 4104 if (!HasFp256 || !VT.is256BitVector()) 4105 return false; 4106 4107 // The shuffle result is divided into half A and half B. In total the two 4108 // sources have 4 halves, namely: C, D, E, F. The final values of A and 4109 // B must come from C, D, E or F. 4110 unsigned HalfSize = VT.getVectorNumElements()/2; 4111 bool MatchA = false, MatchB = false; 4112 4113 // Check if A comes from one of C, D, E, F. 4114 for (unsigned Half = 0; Half != 4; ++Half) { 4115 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) { 4116 MatchA = true; 4117 break; 4118 } 4119 } 4120 4121 // Check if B comes from one of C, D, E, F. 4122 for (unsigned Half = 0; Half != 4; ++Half) { 4123 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) { 4124 MatchB = true; 4125 break; 4126 } 4127 } 4128 4129 return MatchA && MatchB; 4130} 4131 4132/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle 4133/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions. 4134static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { 4135 MVT VT = SVOp->getSimpleValueType(0); 4136 4137 unsigned HalfSize = VT.getVectorNumElements()/2; 4138 4139 unsigned FstHalf = 0, SndHalf = 0; 4140 for (unsigned i = 0; i < HalfSize; ++i) { 4141 if (SVOp->getMaskElt(i) > 0) { 4142 FstHalf = SVOp->getMaskElt(i)/HalfSize; 4143 break; 4144 } 4145 } 4146 for (unsigned i = HalfSize; i < HalfSize*2; ++i) { 4147 if (SVOp->getMaskElt(i) > 0) { 4148 SndHalf = SVOp->getMaskElt(i)/HalfSize; 4149 break; 4150 } 4151 } 4152 4153 return (FstHalf | (SndHalf << 4)); 4154} 4155 4156// Symetric in-lane mask. 
Each lane has 4 elements (for imm8) 4157static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) { 4158 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 4159 if (EltSize < 32) 4160 return false; 4161 4162 unsigned NumElts = VT.getVectorNumElements(); 4163 Imm8 = 0; 4164 if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) { 4165 for (unsigned i = 0; i != NumElts; ++i) { 4166 if (Mask[i] < 0) 4167 continue; 4168 Imm8 |= Mask[i] << (i*2); 4169 } 4170 return true; 4171 } 4172 4173 unsigned LaneSize = 4; 4174 SmallVector<int, 4> MaskVal(LaneSize, -1); 4175 4176 for (unsigned l = 0; l != NumElts; l += LaneSize) { 4177 for (unsigned i = 0; i != LaneSize; ++i) { 4178 if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) 4179 return false; 4180 if (Mask[i+l] < 0) 4181 continue; 4182 if (MaskVal[i] < 0) { 4183 MaskVal[i] = Mask[i+l] - l; 4184 Imm8 |= MaskVal[i] << (i*2); 4185 continue; 4186 } 4187 if (Mask[i+l] != (signed)(MaskVal[i]+l)) 4188 return false; 4189 } 4190 } 4191 return true; 4192} 4193 4194/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand 4195/// specifies a shuffle of elements that is suitable for input to VPERMILPD*. 4196/// Note that VPERMIL mask matching is different depending whether theunderlying 4197/// type is 32 or 64. In the VPERMILPS the high half of the mask should point 4198/// to the same elements of the low, but to the higher half of the source. 4199/// In VPERMILPD the two lanes could be shuffled independently of each other 4200/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY. 
4201static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) { 4202 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 4203 if (VT.getSizeInBits() < 256 || EltSize < 32) 4204 return false; 4205 bool symetricMaskRequired = (EltSize == 32); 4206 unsigned NumElts = VT.getVectorNumElements(); 4207 4208 unsigned NumLanes = VT.getSizeInBits()/128; 4209 unsigned LaneSize = NumElts/NumLanes; 4210 // 2 or 4 elements in one lane 4211 4212 SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1); 4213 for (unsigned l = 0; l != NumElts; l += LaneSize) { 4214 for (unsigned i = 0; i != LaneSize; ++i) { 4215 if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) 4216 return false; 4217 if (symetricMaskRequired) { 4218 if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) { 4219 ExpectedMaskVal[i] = Mask[i+l] - l; 4220 continue; 4221 } 4222 if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l)) 4223 return false; 4224 } 4225 } 4226 } 4227 return true; 4228} 4229 4230/// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse 4231/// of what x86 movss want. X86 movs requires the lowest element to be lowest 4232/// element of vector 2 and the other elements to come from vector 1 in order. 4233static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT, 4234 bool V2IsSplat = false, bool V2IsUndef = false) { 4235 if (!VT.is128BitVector()) 4236 return false; 4237 4238 unsigned NumOps = VT.getVectorNumElements(); 4239 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 4240 return false; 4241 4242 if (!isUndefOrEqual(Mask[0], 0)) 4243 return false; 4244 4245 for (unsigned i = 1; i != NumOps; ++i) 4246 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 4247 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 4248 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 4249 return false; 4250 4251 return true; 4252} 4253 4254/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 4255/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 
4256/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> 4257static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT, 4258 const X86Subtarget *Subtarget) { 4259 if (!Subtarget->hasSSE3()) 4260 return false; 4261 4262 unsigned NumElems = VT.getVectorNumElements(); 4263 4264 if ((VT.is128BitVector() && NumElems != 4) || 4265 (VT.is256BitVector() && NumElems != 8) || 4266 (VT.is512BitVector() && NumElems != 16)) 4267 return false; 4268 4269 // "i+1" is the value the indexed mask element must have 4270 for (unsigned i = 0; i != NumElems; i += 2) 4271 if (!isUndefOrEqual(Mask[i], i+1) || 4272 !isUndefOrEqual(Mask[i+1], i+1)) 4273 return false; 4274 4275 return true; 4276} 4277 4278/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 4279/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 4280/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> 4281static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT, 4282 const X86Subtarget *Subtarget) { 4283 if (!Subtarget->hasSSE3()) 4284 return false; 4285 4286 unsigned NumElems = VT.getVectorNumElements(); 4287 4288 if ((VT.is128BitVector() && NumElems != 4) || 4289 (VT.is256BitVector() && NumElems != 8) || 4290 (VT.is512BitVector() && NumElems != 16)) 4291 return false; 4292 4293 // "i" is the value the indexed mask element must have 4294 for (unsigned i = 0; i != NumElems; i += 2) 4295 if (!isUndefOrEqual(Mask[i], i) || 4296 !isUndefOrEqual(Mask[i+1], i)) 4297 return false; 4298 4299 return true; 4300} 4301 4302/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand 4303/// specifies a shuffle of elements that is suitable for input to 256-bit 4304/// version of MOVDDUP. 
4305static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) { 4306 if (!HasFp256 || !VT.is256BitVector()) 4307 return false; 4308 4309 unsigned NumElts = VT.getVectorNumElements(); 4310 if (NumElts != 4) 4311 return false; 4312 4313 for (unsigned i = 0; i != NumElts/2; ++i) 4314 if (!isUndefOrEqual(Mask[i], 0)) 4315 return false; 4316 for (unsigned i = NumElts/2; i != NumElts; ++i) 4317 if (!isUndefOrEqual(Mask[i], NumElts/2)) 4318 return false; 4319 return true; 4320} 4321 4322/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 4323/// specifies a shuffle of elements that is suitable for input to 128-bit 4324/// version of MOVDDUP. 4325static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) { 4326 if (!VT.is128BitVector()) 4327 return false; 4328 4329 unsigned e = VT.getVectorNumElements() / 2; 4330 for (unsigned i = 0; i != e; ++i) 4331 if (!isUndefOrEqual(Mask[i], i)) 4332 return false; 4333 for (unsigned i = 0; i != e; ++i) 4334 if (!isUndefOrEqual(Mask[e+i], i)) 4335 return false; 4336 return true; 4337} 4338 4339/// isVEXTRACTIndex - Return true if the specified 4340/// EXTRACT_SUBVECTOR operand specifies a vector extract that is 4341/// suitable for instruction that extract 128 or 256 bit vectors 4342static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { 4343 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); 4344 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 4345 return false; 4346 4347 // The index should be aligned on a vecWidth-bit boundary. 
4348 uint64_t Index = 4349 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 4350 4351 MVT VT = N->getSimpleValueType(0); 4352 unsigned ElSize = VT.getVectorElementType().getSizeInBits(); 4353 bool Result = (Index * ElSize) % vecWidth == 0; 4354 4355 return Result; 4356} 4357 4358/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR 4359/// operand specifies a subvector insert that is suitable for input to 4360/// insertion of 128 or 256-bit subvectors 4361static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { 4362 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); 4363 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 4364 return false; 4365 // The index should be aligned on a vecWidth-bit boundary. 4366 uint64_t Index = 4367 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 4368 4369 MVT VT = N->getSimpleValueType(0); 4370 unsigned ElSize = VT.getVectorElementType().getSizeInBits(); 4371 bool Result = (Index * ElSize) % vecWidth == 0; 4372 4373 return Result; 4374} 4375 4376bool X86::isVINSERT128Index(SDNode *N) { 4377 return isVINSERTIndex(N, 128); 4378} 4379 4380bool X86::isVINSERT256Index(SDNode *N) { 4381 return isVINSERTIndex(N, 256); 4382} 4383 4384bool X86::isVEXTRACT128Index(SDNode *N) { 4385 return isVEXTRACTIndex(N, 128); 4386} 4387 4388bool X86::isVEXTRACT256Index(SDNode *N) { 4389 return isVEXTRACTIndex(N, 256); 4390} 4391 4392/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 4393/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 4394/// Handles 128-bit and 256-bit. 4395static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { 4396 MVT VT = N->getSimpleValueType(0); 4397 4398 assert((VT.getSizeInBits() >= 128) && 4399 "Unsupported vector type for PSHUF/SHUFP"); 4400 4401 // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate 4402 // independently on 128-bit lanes. 
4403 unsigned NumElts = VT.getVectorNumElements(); 4404 unsigned NumLanes = VT.getSizeInBits()/128; 4405 unsigned NumLaneElts = NumElts/NumLanes; 4406 4407 assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) && 4408 "Only supports 2, 4 or 8 elements per lane"); 4409 4410 unsigned Shift = (NumLaneElts >= 4) ? 1 : 0; 4411 unsigned Mask = 0; 4412 for (unsigned i = 0; i != NumElts; ++i) { 4413 int Elt = N->getMaskElt(i); 4414 if (Elt < 0) continue; 4415 Elt &= NumLaneElts - 1; 4416 unsigned ShAmt = (i << Shift) % 8; 4417 Mask |= Elt << ShAmt; 4418 } 4419 4420 return Mask; 4421} 4422 4423/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 4424/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 4425static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { 4426 MVT VT = N->getSimpleValueType(0); 4427 4428 assert((VT == MVT::v8i16 || VT == MVT::v16i16) && 4429 "Unsupported vector type for PSHUFHW"); 4430 4431 unsigned NumElts = VT.getVectorNumElements(); 4432 4433 unsigned Mask = 0; 4434 for (unsigned l = 0; l != NumElts; l += 8) { 4435 // 8 nodes per lane, but we only care about the last 4. 4436 for (unsigned i = 0; i < 4; ++i) { 4437 int Elt = N->getMaskElt(l+i+4); 4438 if (Elt < 0) continue; 4439 Elt &= 0x3; // only 2-bits. 4440 Mask |= Elt << (i * 2); 4441 } 4442 } 4443 4444 return Mask; 4445} 4446 4447/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 4448/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 4449static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { 4450 MVT VT = N->getSimpleValueType(0); 4451 4452 assert((VT == MVT::v8i16 || VT == MVT::v16i16) && 4453 "Unsupported vector type for PSHUFHW"); 4454 4455 unsigned NumElts = VT.getVectorNumElements(); 4456 4457 unsigned Mask = 0; 4458 for (unsigned l = 0; l != NumElts; l += 8) { 4459 // 8 nodes per lane, but we only care about the first 4. 
4460 for (unsigned i = 0; i < 4; ++i) { 4461 int Elt = N->getMaskElt(l+i); 4462 if (Elt < 0) continue; 4463 Elt &= 0x3; // only 2-bits 4464 Mask |= Elt << (i * 2); 4465 } 4466 } 4467 4468 return Mask; 4469} 4470 4471/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 4472/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 4473static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { 4474 MVT VT = SVOp->getSimpleValueType(0); 4475 unsigned EltSize = VT.is512BitVector() ? 1 : 4476 VT.getVectorElementType().getSizeInBits() >> 3; 4477 4478 unsigned NumElts = VT.getVectorNumElements(); 4479 unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128; 4480 unsigned NumLaneElts = NumElts/NumLanes; 4481 4482 int Val = 0; 4483 unsigned i; 4484 for (i = 0; i != NumElts; ++i) { 4485 Val = SVOp->getMaskElt(i); 4486 if (Val >= 0) 4487 break; 4488 } 4489 if (Val >= (int)NumElts) 4490 Val -= NumElts - NumLaneElts; 4491 4492 assert(Val - i > 0 && "PALIGNR imm should be positive"); 4493 return (Val - i) * EltSize; 4494} 4495 4496static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { 4497 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); 4498 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 4499 llvm_unreachable("Illegal extract subvector for VEXTRACT"); 4500 4501 uint64_t Index = 4502 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 4503 4504 MVT VecVT = N->getOperand(0).getSimpleValueType(); 4505 MVT ElVT = VecVT.getVectorElementType(); 4506 4507 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); 4508 return Index / NumElemsPerChunk; 4509} 4510 4511static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { 4512 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); 4513 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 4514 llvm_unreachable("Illegal insert subvector for VINSERT"); 4515 4516 uint64_t 
Index = 4517 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 4518 4519 MVT VecVT = N->getSimpleValueType(0); 4520 MVT ElVT = VecVT.getVectorElementType(); 4521 4522 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); 4523 return Index / NumElemsPerChunk; 4524} 4525 4526/// getExtractVEXTRACT128Immediate - Return the appropriate immediate 4527/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 4528/// and VINSERTI128 instructions. 4529unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { 4530 return getExtractVEXTRACTImmediate(N, 128); 4531} 4532 4533/// getExtractVEXTRACT256Immediate - Return the appropriate immediate 4534/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4 4535/// and VINSERTI64x4 instructions. 4536unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { 4537 return getExtractVEXTRACTImmediate(N, 256); 4538} 4539 4540/// getInsertVINSERT128Immediate - Return the appropriate immediate 4541/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 4542/// and VINSERTI128 instructions. 4543unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { 4544 return getInsertVINSERTImmediate(N, 128); 4545} 4546 4547/// getInsertVINSERT256Immediate - Return the appropriate immediate 4548/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4 4549/// and VINSERTI64x4 instructions. 4550unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { 4551 return getInsertVINSERTImmediate(N, 256); 4552} 4553 4554/// isZeroNode - Returns true if Elt is a constant zero or a floating point 4555/// constant +0.0. 
bool X86::isZeroNode(SDValue Elt) {
  // Integer constant zero.
  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
    return CN->isNullValue();
  // Floating-point +0.0 only; -0.0 is NOT a zero for this purpose.
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
    return CFP->getValueAPF().isPosZero();
  return false;
}

/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
/// their permute mask.
static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
                                    SelectionDAG &DAG) {
  MVT VT = SVOp->getSimpleValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> MaskVec;

  for (unsigned i = 0; i != NumElems; ++i) {
    int Idx = SVOp->getMaskElt(i);
    // Defined elements flip which operand they reference; undef stays undef.
    if (Idx >= 0) {
      if (Idx < (int)NumElems)
        Idx += NumElems;
      else
        Idx -= NumElems;
    }
    MaskVec.push_back(Idx);
  }
  return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
                              SVOp->getOperand(0), &MaskVec[0]);
}

/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
/// match movhlps. The lower half elements should come from upper half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order).
static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
  if (!VT.is128BitVector())
    return false;
  if (VT.getVectorNumElements() != 4)
    return false;
  // Mask must be <2, 3, 6, 7> (modulo undef).
  for (unsigned i = 0, e = 2; i != e; ++i)
    if (!isUndefOrEqual(Mask[i], i+2))
      return false;
  for (unsigned i = 2; i != 4; ++i)
    if (!isUndefOrEqual(Mask[i], i+4))
      return false;
  return true;
}

/// isScalarLoadToVector - Returns true if the node is a scalar load that
/// is promoted to a vector. It also returns the LoadSDNode by reference if
/// required.
static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
    return false;
  N = N->getOperand(0).getNode();
  // Only plain (non-extending) loads qualify.
  if (!ISD::isNON_EXTLoad(N))
    return false;
  if (LD)
    *LD = cast<LoadSDNode>(N);
  return true;
}

// Test whether the given value is a vector value which will be legalized
// into a load.
static bool WillBeConstantPoolLoad(SDNode *N) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  // Check for any non-constant elements.
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
    switch (N->getOperand(i).getNode()->getOpcode()) {
    case ISD::UNDEF:
    case ISD::ConstantFP:
    case ISD::Constant:
      break;
    default:
      return false;
    }

  // Vectors of all-zeros and all-ones are materialized with special
  // instructions rather than being loaded.
  return !ISD::isBuildVectorAllZeros(N) &&
         !ISD::isBuildVectorAllOnes(N);
}

/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
/// match movlp{s|d}. The lower half elements should come from lower half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order). And since V1 will become the source of the
/// MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
                               ArrayRef<int> Mask, MVT VT) {
  if (!VT.is128BitVector())
    return false;

  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
    return false;
  // If V2 is a vector load, don't do this transformation. We will try to use
  // load folding shufps op.
  if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
    return false;

  unsigned NumElems = VT.getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;
  // Low half: identity from V1; high half: identity from V2.
  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
    if (!isUndefOrEqual(Mask[i], i))
      return false;
  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
    if (!isUndefOrEqual(Mask[i], i+NumElems))
      return false;
  return true;
}

/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
/// all the same.
static bool isSplatVector(SDNode *N) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  SDValue SplatValue = N->getOperand(0);
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    if (N->getOperand(i) != SplatValue)
      return false;
  return true;
}

/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
/// to a zero vector.
/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
static bool isZeroShuffle(ShuffleVectorSDNode *N) {
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  unsigned NumElems = N->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i != NumElems; ++i) {
    int Idx = N->getMaskElt(i);
    if (Idx >= (int)NumElems) {
      // Element selected from V2: V2 must be undef, an all-zeros vector, or
      // a BUILD_VECTOR whose selected element is zero.
      unsigned Opc = V2.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
        return false;
    } else if (Idx >= 0) {
      // Same check for elements selected from V1.
      unsigned Opc = V1.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V1.getOperand(Idx)))
        return false;
    }
  }
  return true;
}

/// getZeroVector - Returns a vector of specified type with all zero elements.
4713/// 4714static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, 4715 SelectionDAG &DAG, SDLoc dl) { 4716 assert(VT.isVector() && "Expected a vector type"); 4717 4718 // Always build SSE zero vectors as <4 x i32> bitcasted 4719 // to their dest type. This ensures they get CSE'd. 4720 SDValue Vec; 4721 if (VT.is128BitVector()) { // SSE 4722 if (Subtarget->hasSSE2()) { // SSE2 4723 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 4724 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4725 } else { // SSE1 4726 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 4727 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 4728 } 4729 } else if (VT.is256BitVector()) { // AVX 4730 if (Subtarget->hasInt256()) { // AVX2 4731 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 4732 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4733 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 4734 array_lengthof(Ops)); 4735 } else { 4736 // 256-bit logic and arithmetic instructions in AVX are all 4737 // floating-point, no support for integer ops. Emit fp zeroed vectors. 4738 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 4739 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4740 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 4741 array_lengthof(Ops)); 4742 } 4743 } else if (VT.is512BitVector()) { // AVX-512 4744 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 4745 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, 4746 Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4747 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16); 4748 } else 4749 llvm_unreachable("Unexpected vector type"); 4750 4751 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 4752} 4753 4754/// getOnesVector - Returns a vector of specified type with all bits set. 4755/// Always build ones vectors as <4 x i32> or <8 x i32>. 
For 256-bit types with 4756/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. 4757/// Then bitcast to their original type, ensuring they get CSE'd. 4758static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, 4759 SDLoc dl) { 4760 assert(VT.isVector() && "Expected a vector type"); 4761 4762 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 4763 SDValue Vec; 4764 if (VT.is256BitVector()) { 4765 if (HasInt256) { // AVX2 4766 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4767 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 4768 array_lengthof(Ops)); 4769 } else { // AVX 4770 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4771 Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); 4772 } 4773 } else if (VT.is128BitVector()) { 4774 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4775 } else 4776 llvm_unreachable("Unexpected vector type"); 4777 4778 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 4779} 4780 4781/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 4782/// that point to V2 points to its first element. 4783static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) { 4784 for (unsigned i = 0; i != NumElems; ++i) { 4785 if (Mask[i] > (int)NumElems) { 4786 Mask[i] = NumElems; 4787 } 4788 } 4789} 4790 4791/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 4792/// operation of specified width. 4793static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, 4794 SDValue V2) { 4795 unsigned NumElems = VT.getVectorNumElements(); 4796 SmallVector<int, 8> Mask; 4797 Mask.push_back(NumElems); 4798 for (unsigned i = 1; i != NumElems; ++i) 4799 Mask.push_back(i); 4800 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4801} 4802 4803/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 
4804static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, 4805 SDValue V2) { 4806 unsigned NumElems = VT.getVectorNumElements(); 4807 SmallVector<int, 8> Mask; 4808 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 4809 Mask.push_back(i); 4810 Mask.push_back(i + NumElems); 4811 } 4812 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4813} 4814 4815/// getUnpackh - Returns a vector_shuffle node for an unpackh operation. 4816static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, 4817 SDValue V2) { 4818 unsigned NumElems = VT.getVectorNumElements(); 4819 SmallVector<int, 8> Mask; 4820 for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) { 4821 Mask.push_back(i + Half); 4822 Mask.push_back(i + NumElems + Half); 4823 } 4824 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4825} 4826 4827// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by 4828// a generic shuffle instruction because the target has no such instructions. 4829// Generate shuffles which repeat i16 and i8 several times until they can be 4830// represented by v4f32 and then be manipulated by target suported shuffles. 
4831static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { 4832 MVT VT = V.getSimpleValueType(); 4833 int NumElems = VT.getVectorNumElements(); 4834 SDLoc dl(V); 4835 4836 while (NumElems > 4) { 4837 if (EltNo < NumElems/2) { 4838 V = getUnpackl(DAG, dl, VT, V, V); 4839 } else { 4840 V = getUnpackh(DAG, dl, VT, V, V); 4841 EltNo -= NumElems/2; 4842 } 4843 NumElems >>= 1; 4844 } 4845 return V; 4846} 4847 4848/// getLegalSplat - Generate a legal splat with supported x86 shuffles 4849static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { 4850 MVT VT = V.getSimpleValueType(); 4851 SDLoc dl(V); 4852 4853 if (VT.is128BitVector()) { 4854 V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); 4855 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 4856 V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32), 4857 &SplatMask[0]); 4858 } else if (VT.is256BitVector()) { 4859 // To use VPERMILPS to splat scalars, the second half of indicies must 4860 // refer to the higher part, which is a duplication of the lower one, 4861 // because VPERMILPS can only handle in-lane permutations. 4862 int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo, 4863 EltNo+4, EltNo+4, EltNo+4, EltNo+4 }; 4864 4865 V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V); 4866 V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32), 4867 &SplatMask[0]); 4868 } else 4869 llvm_unreachable("Vector size not supported"); 4870 4871 return DAG.getNode(ISD::BITCAST, dl, VT, V); 4872} 4873 4874/// PromoteSplat - Splat is promoted to target supported vector shuffles. 
4875static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 4876 MVT SrcVT = SV->getSimpleValueType(0); 4877 SDValue V1 = SV->getOperand(0); 4878 SDLoc dl(SV); 4879 4880 int EltNo = SV->getSplatIndex(); 4881 int NumElems = SrcVT.getVectorNumElements(); 4882 bool Is256BitVec = SrcVT.is256BitVector(); 4883 4884 assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) && 4885 "Unknown how to promote splat for type"); 4886 4887 // Extract the 128-bit part containing the splat element and update 4888 // the splat element index when it refers to the higher register. 4889 if (Is256BitVec) { 4890 V1 = Extract128BitVector(V1, EltNo, DAG, dl); 4891 if (EltNo >= NumElems/2) 4892 EltNo -= NumElems/2; 4893 } 4894 4895 // All i16 and i8 vector types can't be used directly by a generic shuffle 4896 // instruction because the target has no such instruction. Generate shuffles 4897 // which repeat i16 and i8 several times until they fit in i32, and then can 4898 // be manipulated by target suported shuffles. 4899 MVT EltVT = SrcVT.getVectorElementType(); 4900 if (EltVT == MVT::i8 || EltVT == MVT::i16) 4901 V1 = PromoteSplati8i16(V1, DAG, EltNo); 4902 4903 // Recreate the 256-bit vector and place the same 128-bit vector 4904 // into the low and high part. This is necessary because we want 4905 // to use VPERM* to shuffle the vectors 4906 if (Is256BitVec) { 4907 V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1); 4908 } 4909 4910 return getLegalSplat(DAG, V1, EltNo); 4911} 4912 4913/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 4914/// vector of zero or undef vector. This produces a shuffle where the low 4915/// element of V2 is swizzled into the zero/undef vector, landing at element 4916/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
                                           bool IsZero,
                                           const X86Subtarget *Subtarget,
                                           SelectionDAG &DAG) {
  MVT VT = V2.getSimpleValueType();
  // V1 is the background vector (all-zeros or undef) that V2's low element
  // is inserted into.
  SDValue V1 = IsZero
    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 16> MaskVec;
  for (unsigned i = 0; i != NumElems; ++i)
    // If this is the insertion idx, put the low elt of V2 here.
    MaskVec.push_back(i == Idx ? NumElems : i);
  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
}

/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
/// target specific opcode. Returns true if the Mask could be calculated.
/// Sets IsUnary to true if only uses one source.
static bool getTargetShuffleMask(SDNode *N, MVT VT,
                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
  unsigned NumElems = VT.getVectorNumElements();
  SDValue ImmN;

  IsUnary = false;
  switch(N->getOpcode()) {
  case X86ISD::SHUFP:
    // Immediate-controlled shuffles carry the imm as their last operand.
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    break;
  case X86ISD::UNPCKH:
    DecodeUNPCKHMask(VT, Mask);
    break;
  case X86ISD::UNPCKL:
    DecodeUNPCKLMask(VT, Mask);
    break;
  case X86ISD::MOVHLPS:
    DecodeMOVHLPSMask(NumElems, Mask);
    break;
  case X86ISD::MOVLHPS:
    DecodeMOVLHPSMask(NumElems, Mask);
    break;
  case X86ISD::PALIGNR:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    break;
  case X86ISD::PSHUFD:
  case X86ISD::VPERMILP:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFHW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFLW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::VPERMI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSS:
  case X86ISD::MOVSD: {
    // The index 0 always comes from the first element of the second source,
    // this is why MOVSS and MOVSD are used in the first place. The other
    // elements come from the other positions of the first source vector
    Mask.push_back(NumElems);
    for (unsigned i = 1; i != NumElems; ++i) {
      Mask.push_back(i);
    }
    break;
  }
  case X86ISD::VPERM2X128:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    // Decoding may fail (e.g. a zeroing immediate); signal that to callers.
    if (Mask.empty()) return false;
    break;
  case X86ISD::MOVDDUP:
  case X86ISD::MOVLHPD:
  case X86ISD::MOVLPD:
  case X86ISD::MOVLPS:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
    // Not yet implemented
    return false;
  default: llvm_unreachable("unknown target shuffle node");
  }

  return true;
}

/// getShuffleScalarElt - Returns the scalar element that will make up the ith
/// element of the result of the vector shuffle.
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
                                   unsigned Depth) {
  if (Depth == 6)
    return SDValue();  // Limit search depth.

  SDValue V = SDValue(N, 0);
  EVT VT = V.getValueType();
  unsigned Opcode = V.getOpcode();

  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
    int Elt = SV->getMaskElt(Index);

    if (Elt < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    unsigned NumElems = VT.getVectorNumElements();
    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
                                         : SV->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
  }

  // Recurse into target specific vector shuffles to find scalars.
  if (isTargetShuffle(Opcode)) {
    MVT ShufVT = V.getSimpleValueType();
    unsigned NumElems = ShufVT.getVectorNumElements();
    SmallVector<int, 16> ShuffleMask;
    bool IsUnary;

    if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
      return SDValue();

    int Elt = ShuffleMask[Index];
    if (Elt < 0)
      return DAG.getUNDEF(ShufVT.getVectorElementType());

    SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
                                         : N->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
                               Depth+1);
  }

  // Actual nodes that may contain scalar elements

  // Look through a bitcast only when it preserves the element count.
  if (Opcode == ISD::BITCAST) {
    V = V.getOperand(0);
    EVT SrcVT = V.getValueType();
    unsigned NumElems = VT.getVectorNumElements();

    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
  }

  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? V.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (V.getOpcode() == ISD::BUILD_VECTOR)
    return V.getOperand(Index);

  return SDValue();
}

/// getNumOfConsecutiveZeros - Return the number of elements of a vector
/// shuffle operation which are consecutively zero, counting from one end.
/// The search can start in two different directions, from left or right.
/// We count undefs as zeros until PreferredNum is reached.
static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
                                         unsigned NumElems, bool ZerosFromLeft,
                                         SelectionDAG &DAG,
                                         unsigned PreferredNum = -1U) {
  unsigned NumZeros = 0;
  for (unsigned i = 0; i != NumElems; ++i) {
    // Walk from the left edge or the right edge of the mask.
    unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
    SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
    if (!Elt.getNode())
      break;

    if (X86::isZeroNode(Elt))
      ++NumZeros;
    else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
      NumZeros = std::min(NumZeros + 1, PreferredNum);
    else
      break;
  }

  return NumZeros;
}

/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
/// correspond consecutively to elements from one of the vector operands,
/// starting from its index OpIdx. Also tell OpNum which source vector operand.
static
bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
                              unsigned MaskI, unsigned MaskE, unsigned OpIdx,
                              unsigned NumElems, unsigned &OpNum) {
  bool SeenV1 = false;
  bool SeenV2 = false;

  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
    int Idx = SVOp->getMaskElt(i);
    // Ignore undef indicies
    if (Idx < 0)
      continue;

    // Record which source operand this mask element selects from.
    if (Idx < (int)NumElems)
      SeenV1 = true;
    else
      SeenV2 = true;

    // Only accept consecutive elements from the same vector
    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
      return false;
  }

  OpNum = SeenV1 ? 0 : 1;
  return true;
}

/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
/// logical right shift of a vector.
static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems =
    SVOp->getSimpleValueType(0).getVectorNumElements();
  // Count trailing zeros; cap undef-as-zero counting at the first mask
  // element so the resulting shift amount stays consistent with the mask.
  unsigned NumZeros = getNumOfConsecutiveZeros(
      SVOp, NumElems, false /* check zeros from right */, DAG,
      SVOp->getMaskElt(0));
  unsigned OpSrc;

  if (!NumZeros)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
  //
  //               V1 = {X, A, B, C}     0
  //                         \  \  \    /
  //   vector_shuffle V1, V2 <1, 2, 3, X>
  //
  if (!isShuffleMaskConsecutive(SVOp,
            0,                   // Mask Start Index
            NumElems-NumZeros,   // Mask End Index(exclusive)
            NumZeros,            // Where to start looking in the src vector
            NumElems,            // Number of elements in vector
            OpSrc))              // Which source operand ?
    return false;

  isLeft = false;
  ShAmt = NumZeros;
  ShVal = SVOp->getOperand(OpSrc);
  return true;
}

/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
/// logical left shift of a vector.
static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems =
    SVOp->getSimpleValueType(0).getVectorNumElements();
  // Count leading zeros; cap undef-as-zero counting based on the last mask
  // element so the shift amount agrees with the non-zero tail.
  unsigned NumZeros = getNumOfConsecutiveZeros(
      SVOp, NumElems, true /* check zeros from left */, DAG,
      NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
  unsigned OpSrc;

  if (!NumZeros)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
  //
  //                           0    { A, B, X, X } = V2
  //                          / \    /  /
  //  vector_shuffle V1, V2 <X, X, 4, 5>
  //
  if (!isShuffleMaskConsecutive(SVOp,
            NumZeros,     // Mask Start Index
            NumElems,     // Mask End Index(exclusive)
            0,            // Where to start looking in the src vector
            NumElems,     // Number of elements in vector
            OpSrc))       // Which source operand ?
    return false;

  isLeft = true;
  ShAmt = NumZeros;
  ShVal = SVOp->getOperand(OpSrc);
  return true;
}

/// isVectorShift - Returns true if the shuffle can be implemented as a
/// logical left or right shift of a vector.
static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  // Although the logic below support any bitwidth size, there are no
  // shift instructions which handle more than 128-bit vectors.
  if (!SVOp->getSimpleValueType(0).is128BitVector())
    return false;

  if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
      isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
    return true;

  return false;
}

/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
///
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget* Subtarget,
                                     const TargetLowering &TLI) {
  // Building via pairwise i16 inserts is only profitable for at most 8
  // non-zero bytes; otherwise fall back to the generic path.
  if (NumNonZero > 8)
    return SDValue();

  SDLoc dl(Op);
  SDValue V(0, 0);
  bool First = true;
  for (unsigned i = 0; i < 16; ++i) {
    // NonZeros is a bitmask: bit i set means operand i is a non-zero element.
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    if (ThisIsNonZero && First) {
      // Lazily materialize the accumulator: zeros if any lane must be zero,
      // undef otherwise. Built as v8i16 since bytes are inserted in pairs.
      if (NumZero)
        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
      else
        V = DAG.getUNDEF(MVT::v8i16);
      First = false;
    }

    // At every odd byte index, combine bytes (i-1, i) into one i16 lane.
    if ((i & 1) != 0) {
      SDValue ThisElt(0, 0), LastElt(0, 0);
      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
      if (LastIsNonZero) {
        // Low byte of the i16 lane.
        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
                              MVT::i16, Op.getOperand(i-1));
      }
      if (ThisIsNonZero) {
        // High byte of the i16 lane: zero-extend then shift into position.
        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
                              ThisElt, DAG.getConstant(8, MVT::i8));
        if (LastIsNonZero)
          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
      } else
        ThisElt = LastElt;

      if (ThisElt.getNode())
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
                        DAG.getIntPtrConstant(i/2));
    }
  }

  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
}

/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
5268/// 5269static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 5270 unsigned NumNonZero, unsigned NumZero, 5271 SelectionDAG &DAG, 5272 const X86Subtarget* Subtarget, 5273 const TargetLowering &TLI) { 5274 if (NumNonZero > 4) 5275 return SDValue(); 5276 5277 SDLoc dl(Op); 5278 SDValue V(0, 0); 5279 bool First = true; 5280 for (unsigned i = 0; i < 8; ++i) { 5281 bool isNonZero = (NonZeros & (1 << i)) != 0; 5282 if (isNonZero) { 5283 if (First) { 5284 if (NumZero) 5285 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 5286 else 5287 V = DAG.getUNDEF(MVT::v8i16); 5288 First = false; 5289 } 5290 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 5291 MVT::v8i16, V, Op.getOperand(i), 5292 DAG.getIntPtrConstant(i)); 5293 } 5294 } 5295 5296 return V; 5297} 5298 5299/// getVShift - Return a vector logical shift node. 5300/// 5301static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 5302 unsigned NumBits, SelectionDAG &DAG, 5303 const TargetLowering &TLI, SDLoc dl) { 5304 assert(VT.is128BitVector() && "Unknown type for VShift"); 5305 EVT ShVT = MVT::v2i64; 5306 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; 5307 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 5308 return DAG.getNode(ISD::BITCAST, dl, VT, 5309 DAG.getNode(Opc, dl, ShVT, SrcOp, 5310 DAG.getConstant(NumBits, 5311 TLI.getScalarShiftAmountTy(SrcOp.getValueType())))); 5312} 5313 5314static SDValue 5315LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { 5316 5317 // Check if the scalar load can be widened into a vector load. And if 5318 // the address is "base + cst" see if the cst can be "absorbed" into 5319 // the shuffle mask. 
5320 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 5321 SDValue Ptr = LD->getBasePtr(); 5322 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 5323 return SDValue(); 5324 EVT PVT = LD->getValueType(0); 5325 if (PVT != MVT::i32 && PVT != MVT::f32) 5326 return SDValue(); 5327 5328 int FI = -1; 5329 int64_t Offset = 0; 5330 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 5331 FI = FINode->getIndex(); 5332 Offset = 0; 5333 } else if (DAG.isBaseWithConstantOffset(Ptr) && 5334 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 5335 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 5336 Offset = Ptr.getConstantOperandVal(1); 5337 Ptr = Ptr.getOperand(0); 5338 } else { 5339 return SDValue(); 5340 } 5341 5342 // FIXME: 256-bit vector instructions don't require a strict alignment, 5343 // improve this code to support it better. 5344 unsigned RequiredAlign = VT.getSizeInBits()/8; 5345 SDValue Chain = LD->getChain(); 5346 // Make sure the stack object alignment is at least 16 or 32. 5347 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5348 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { 5349 if (MFI->isFixedObjectIndex(FI)) { 5350 // Can't change the alignment. FIXME: It's possible to compute 5351 // the exact stack offset and reference FI + adjust offset instead. 5352 // If someone *really* cares about this. That's the way to implement it. 5353 return SDValue(); 5354 } else { 5355 MFI->setObjectAlignment(FI, RequiredAlign); 5356 } 5357 } 5358 5359 // (Offset % 16 or 32) must be multiple of 4. Then address is then 5360 // Ptr + (Offset & ~15). 
5361 if (Offset < 0) 5362 return SDValue(); 5363 if ((Offset % RequiredAlign) & 3) 5364 return SDValue(); 5365 int64_t StartOffset = Offset & ~(RequiredAlign-1); 5366 if (StartOffset) 5367 Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(), 5368 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 5369 5370 int EltNo = (Offset - StartOffset) >> 2; 5371 unsigned NumElems = VT.getVectorNumElements(); 5372 5373 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); 5374 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, 5375 LD->getPointerInfo().getWithOffset(StartOffset), 5376 false, false, false, 0); 5377 5378 SmallVector<int, 8> Mask; 5379 for (unsigned i = 0; i != NumElems; ++i) 5380 Mask.push_back(EltNo); 5381 5382 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); 5383 } 5384 5385 return SDValue(); 5386} 5387 5388/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 5389/// vector of type 'VT', see if the elements can be replaced by a single large 5390/// load which has the same value as a build_vector whose operands are 'elts'. 5391/// 5392/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 5393/// 5394/// FIXME: we'd also like to handle the case where the last elements are zero 5395/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 5396/// There's even a handy isZeroNode for that purpose. 5397static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 5398 SDLoc &DL, SelectionDAG &DAG) { 5399 EVT EltVT = VT.getVectorElementType(); 5400 unsigned NumElems = Elts.size(); 5401 5402 LoadSDNode *LDBase = NULL; 5403 unsigned LastLoadedElt = -1U; 5404 5405 // For each element in the initializer, see if we've found a load or an undef. 5406 // If we don't find an initial load element, or later load elements are 5407 // non-consecutive, bail out. 
5408 for (unsigned i = 0; i < NumElems; ++i) { 5409 SDValue Elt = Elts[i]; 5410 5411 if (!Elt.getNode() || 5412 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 5413 return SDValue(); 5414 if (!LDBase) { 5415 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 5416 return SDValue(); 5417 LDBase = cast<LoadSDNode>(Elt.getNode()); 5418 LastLoadedElt = i; 5419 continue; 5420 } 5421 if (Elt.getOpcode() == ISD::UNDEF) 5422 continue; 5423 5424 LoadSDNode *LD = cast<LoadSDNode>(Elt); 5425 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 5426 return SDValue(); 5427 LastLoadedElt = i; 5428 } 5429 5430 // If we have found an entire vector of loads and undefs, then return a large 5431 // load of the entire vector width starting at the base pointer. If we found 5432 // consecutive loads for the low half, generate a vzext_load node. 5433 if (LastLoadedElt == NumElems - 1) { 5434 SDValue NewLd = SDValue(); 5435 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 5436 NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 5437 LDBase->getPointerInfo(), 5438 LDBase->isVolatile(), LDBase->isNonTemporal(), 5439 LDBase->isInvariant(), 0); 5440 NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 5441 LDBase->getPointerInfo(), 5442 LDBase->isVolatile(), LDBase->isNonTemporal(), 5443 LDBase->isInvariant(), LDBase->getAlignment()); 5444 5445 if (LDBase->hasAnyUseOfValue(1)) { 5446 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 5447 SDValue(LDBase, 1), 5448 SDValue(NewLd.getNode(), 1)); 5449 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); 5450 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), 5451 SDValue(NewLd.getNode(), 1)); 5452 } 5453 5454 return NewLd; 5455 } 5456 if (NumElems == 4 && LastLoadedElt == 1 && 5457 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { 5458 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 5459 SDValue Ops[] = { LDBase->getChain(), 
LDBase->getBasePtr() }; 5460 SDValue ResNode = 5461 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 5462 array_lengthof(Ops), MVT::i64, 5463 LDBase->getPointerInfo(), 5464 LDBase->getAlignment(), 5465 false/*isVolatile*/, true/*ReadMem*/, 5466 false/*WriteMem*/); 5467 5468 // Make sure the newly-created LOAD is in the same position as LDBase in 5469 // terms of dependency. We create a TokenFactor for LDBase and ResNode, and 5470 // update uses of LDBase's output chain to use the TokenFactor. 5471 if (LDBase->hasAnyUseOfValue(1)) { 5472 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 5473 SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); 5474 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); 5475 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), 5476 SDValue(ResNode.getNode(), 1)); 5477 } 5478 5479 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 5480 } 5481 return SDValue(); 5482} 5483 5484/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction 5485/// to generate a splat value for the following cases: 5486/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. 5487/// 2. A splat shuffle which uses a scalar_to_vector node which comes from 5488/// a scalar load, or a constant. 5489/// The VBROADCAST node is returned when a pattern is found, 5490/// or SDValue() otherwise. 5491static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, 5492 SelectionDAG &DAG) { 5493 if (!Subtarget->hasFp256()) 5494 return SDValue(); 5495 5496 MVT VT = Op.getSimpleValueType(); 5497 SDLoc dl(Op); 5498 5499 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && 5500 "Unsupported vector type for broadcast."); 5501 5502 SDValue Ld; 5503 bool ConstSplatVal; 5504 5505 switch (Op.getOpcode()) { 5506 default: 5507 // Unknown pattern found. 5508 return SDValue(); 5509 5510 case ISD::BUILD_VECTOR: { 5511 // The BUILD_VECTOR node must be a splat. 
5512 if (!isSplatVector(Op.getNode())) 5513 return SDValue(); 5514 5515 Ld = Op.getOperand(0); 5516 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 5517 Ld.getOpcode() == ISD::ConstantFP); 5518 5519 // The suspected load node has several users. Make sure that all 5520 // of its users are from the BUILD_VECTOR node. 5521 // Constants may have multiple users. 5522 if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) 5523 return SDValue(); 5524 break; 5525 } 5526 5527 case ISD::VECTOR_SHUFFLE: { 5528 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5529 5530 // Shuffles must have a splat mask where the first element is 5531 // broadcasted. 5532 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) 5533 return SDValue(); 5534 5535 SDValue Sc = Op.getOperand(0); 5536 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && 5537 Sc.getOpcode() != ISD::BUILD_VECTOR) { 5538 5539 if (!Subtarget->hasInt256()) 5540 return SDValue(); 5541 5542 // Use the register form of the broadcast instruction available on AVX2. 5543 if (VT.getSizeInBits() >= 256) 5544 Sc = Extract128BitVector(Sc, 0, DAG, dl); 5545 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); 5546 } 5547 5548 Ld = Sc.getOperand(0); 5549 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 5550 Ld.getOpcode() == ISD::ConstantFP); 5551 5552 // The scalar_to_vector node and the suspected 5553 // load node must have exactly one user. 5554 // Constants may have multiple users. 5555 5556 // AVX-512 has register version of the broadcast 5557 bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() && 5558 Ld.getValueType().getSizeInBits() >= 32; 5559 if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && 5560 !hasRegVer)) 5561 return SDValue(); 5562 break; 5563 } 5564 } 5565 5566 bool IsGE256 = (VT.getSizeInBits() >= 256); 5567 5568 // Handle the broadcasting a single constant scalar from the constant pool 5569 // into a vector. 
On Sandybridge it is still better to load a constant vector 5570 // from the constant pool and not to broadcast it from a scalar. 5571 if (ConstSplatVal && Subtarget->hasInt256()) { 5572 EVT CVT = Ld.getValueType(); 5573 assert(!CVT.isVector() && "Must not broadcast a vector type"); 5574 unsigned ScalarSize = CVT.getSizeInBits(); 5575 5576 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) { 5577 const Constant *C = 0; 5578 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) 5579 C = CI->getConstantIntValue(); 5580 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) 5581 C = CF->getConstantFPValue(); 5582 5583 assert(C && "Invalid constant type"); 5584 5585 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5586 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); 5587 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 5588 Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, 5589 MachinePointerInfo::getConstantPool(), 5590 false, false, false, Alignment); 5591 5592 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5593 } 5594 } 5595 5596 bool IsLoad = ISD::isNormalLoad(Ld.getNode()); 5597 unsigned ScalarSize = Ld.getValueType().getSizeInBits(); 5598 5599 // Handle AVX2 in-register broadcasts. 5600 if (!IsLoad && Subtarget->hasInt256() && 5601 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) 5602 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5603 5604 // The scalar source must be a normal load. 
5605 if (!IsLoad) 5606 return SDValue(); 5607 5608 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) 5609 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5610 5611 // The integer check is needed for the 64-bit into 128-bit so it doesn't match 5612 // double since there is no vbroadcastsd xmm 5613 if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) { 5614 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) 5615 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5616 } 5617 5618 // Unsupported broadcast. 5619 return SDValue(); 5620} 5621 5622static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { 5623 MVT VT = Op.getSimpleValueType(); 5624 5625 // Skip if insert_vec_elt is not supported. 5626 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5627 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) 5628 return SDValue(); 5629 5630 SDLoc DL(Op); 5631 unsigned NumElems = Op.getNumOperands(); 5632 5633 SDValue VecIn1; 5634 SDValue VecIn2; 5635 SmallVector<unsigned, 4> InsertIndices; 5636 SmallVector<int, 8> Mask(NumElems, -1); 5637 5638 for (unsigned i = 0; i != NumElems; ++i) { 5639 unsigned Opc = Op.getOperand(i).getOpcode(); 5640 5641 if (Opc == ISD::UNDEF) 5642 continue; 5643 5644 if (Opc != ISD::EXTRACT_VECTOR_ELT) { 5645 // Quit if more than 1 elements need inserting. 5646 if (InsertIndices.size() > 1) 5647 return SDValue(); 5648 5649 InsertIndices.push_back(i); 5650 continue; 5651 } 5652 5653 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); 5654 SDValue ExtIdx = Op.getOperand(i).getOperand(1); 5655 5656 // Quit if extracted from vector of different type. 5657 if (ExtractedFromVec.getValueType() != VT) 5658 return SDValue(); 5659 5660 // Quit if non-constant index. 
5661 if (!isa<ConstantSDNode>(ExtIdx)) 5662 return SDValue(); 5663 5664 if (VecIn1.getNode() == 0) 5665 VecIn1 = ExtractedFromVec; 5666 else if (VecIn1 != ExtractedFromVec) { 5667 if (VecIn2.getNode() == 0) 5668 VecIn2 = ExtractedFromVec; 5669 else if (VecIn2 != ExtractedFromVec) 5670 // Quit if more than 2 vectors to shuffle 5671 return SDValue(); 5672 } 5673 5674 unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); 5675 5676 if (ExtractedFromVec == VecIn1) 5677 Mask[i] = Idx; 5678 else if (ExtractedFromVec == VecIn2) 5679 Mask[i] = Idx + NumElems; 5680 } 5681 5682 if (VecIn1.getNode() == 0) 5683 return SDValue(); 5684 5685 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); 5686 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]); 5687 for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { 5688 unsigned Idx = InsertIndices[i]; 5689 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), 5690 DAG.getIntPtrConstant(Idx)); 5691 } 5692 5693 return NV; 5694} 5695 5696// Lower BUILD_VECTOR operation for v8i1 and v16i1 types. 
5697SDValue 5698X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { 5699 5700 MVT VT = Op.getSimpleValueType(); 5701 assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) && 5702 "Unexpected type in LowerBUILD_VECTORvXi1!"); 5703 5704 SDLoc dl(Op); 5705 if (ISD::isBuildVectorAllZeros(Op.getNode())) { 5706 SDValue Cst = DAG.getTargetConstant(0, MVT::i1); 5707 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, 5708 Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 5709 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, 5710 Ops, VT.getVectorNumElements()); 5711 } 5712 5713 if (ISD::isBuildVectorAllOnes(Op.getNode())) { 5714 SDValue Cst = DAG.getTargetConstant(1, MVT::i1); 5715 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, 5716 Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 5717 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, 5718 Ops, VT.getVectorNumElements()); 5719 } 5720 5721 bool AllContants = true; 5722 uint64_t Immediate = 0; 5723 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { 5724 SDValue In = Op.getOperand(idx); 5725 if (In.getOpcode() == ISD::UNDEF) 5726 continue; 5727 if (!isa<ConstantSDNode>(In)) { 5728 AllContants = false; 5729 break; 5730 } 5731 if (cast<ConstantSDNode>(In)->getZExtValue()) 5732 Immediate |= (1ULL << idx); 5733 } 5734 5735 if (AllContants) { 5736 SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, 5737 DAG.getConstant(Immediate, MVT::i16)); 5738 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask, 5739 DAG.getIntPtrConstant(0)); 5740 } 5741 5742 // Splat vector (with undefs) 5743 SDValue In = Op.getOperand(0); 5744 for (unsigned i = 1, e = Op.getNumOperands(); i != e; ++i) { 5745 if (Op.getOperand(i) != In && Op.getOperand(i).getOpcode() != ISD::UNDEF) 5746 llvm_unreachable("Unsupported predicate operation"); 5747 } 5748 5749 SDValue EFLAGS, X86CC; 5750 if (In.getOpcode() == ISD::SETCC) { 5751 SDValue Op0 = In.getOperand(0); 5752 SDValue Op1 = 
In.getOperand(1); 5753 ISD::CondCode CC = cast<CondCodeSDNode>(In.getOperand(2))->get(); 5754 bool isFP = Op1.getValueType().isFloatingPoint(); 5755 unsigned X86CCVal = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 5756 5757 assert(X86CCVal != X86::COND_INVALID && "Unsupported predicate operation"); 5758 5759 X86CC = DAG.getConstant(X86CCVal, MVT::i8); 5760 EFLAGS = EmitCmp(Op0, Op1, X86CCVal, DAG); 5761 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); 5762 } else if (In.getOpcode() == X86ISD::SETCC) { 5763 X86CC = In.getOperand(0); 5764 EFLAGS = In.getOperand(1); 5765 } else { 5766 // The algorithm: 5767 // Bit1 = In & 0x1 5768 // if (Bit1 != 0) 5769 // ZF = 0 5770 // else 5771 // ZF = 1 5772 // if (ZF == 0) 5773 // res = allOnes ### CMOVNE -1, %res 5774 // else 5775 // res = allZero 5776 MVT InVT = In.getSimpleValueType(); 5777 SDValue Bit1 = DAG.getNode(ISD::AND, dl, InVT, In, DAG.getConstant(1, InVT)); 5778 EFLAGS = EmitTest(Bit1, X86::COND_NE, DAG); 5779 X86CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5780 } 5781 5782 if (VT == MVT::v16i1) { 5783 SDValue Cst1 = DAG.getConstant(-1, MVT::i16); 5784 SDValue Cst0 = DAG.getConstant(0, MVT::i16); 5785 SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i16, 5786 Cst0, Cst1, X86CC, EFLAGS); 5787 return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp); 5788 } 5789 5790 if (VT == MVT::v8i1) { 5791 SDValue Cst1 = DAG.getConstant(-1, MVT::i32); 5792 SDValue Cst0 = DAG.getConstant(0, MVT::i32); 5793 SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i32, 5794 Cst0, Cst1, X86CC, EFLAGS); 5795 CmovOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CmovOp); 5796 return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp); 5797 } 5798 llvm_unreachable("Unsupported predicate operation"); 5799} 5800 5801SDValue 5802X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 5803 SDLoc dl(Op); 5804 5805 MVT VT = Op.getSimpleValueType(); 5806 MVT ExtVT = VT.getVectorElementType(); 5807 unsigned NumElems = Op.getNumOperands(); 5808 5809 // 
Generate vectors for predicate vectors. 5810 if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512()) 5811 return LowerBUILD_VECTORvXi1(Op, DAG); 5812 5813 // Vectors containing all zeros can be matched by pxor and xorps later 5814 if (ISD::isBuildVectorAllZeros(Op.getNode())) { 5815 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd 5816 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. 5817 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) 5818 return Op; 5819 5820 return getZeroVector(VT, Subtarget, DAG, dl); 5821 } 5822 5823 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width 5824 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use 5825 // vpcmpeqd on 256-bit vectors. 5826 if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { 5827 if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256())) 5828 return Op; 5829 5830 if (!VT.is512BitVector()) 5831 return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); 5832 } 5833 5834 SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG); 5835 if (Broadcast.getNode()) 5836 return Broadcast; 5837 5838 unsigned EVTBits = ExtVT.getSizeInBits(); 5839 5840 unsigned NumZero = 0; 5841 unsigned NumNonZero = 0; 5842 unsigned NonZeros = 0; 5843 bool IsAllConstants = true; 5844 SmallSet<SDValue, 8> Values; 5845 for (unsigned i = 0; i < NumElems; ++i) { 5846 SDValue Elt = Op.getOperand(i); 5847 if (Elt.getOpcode() == ISD::UNDEF) 5848 continue; 5849 Values.insert(Elt); 5850 if (Elt.getOpcode() != ISD::Constant && 5851 Elt.getOpcode() != ISD::ConstantFP) 5852 IsAllConstants = false; 5853 if (X86::isZeroNode(Elt)) 5854 NumZero++; 5855 else { 5856 NonZeros |= (1 << i); 5857 NumNonZero++; 5858 } 5859 } 5860 5861 // All undef vector. Return an UNDEF. All zero vectors were handled above. 5862 if (NumNonZero == 0) 5863 return DAG.getUNDEF(VT); 5864 5865 // Special case for single non-zero, non-undef, element. 
5866 if (NumNonZero == 1) { 5867 unsigned Idx = countTrailingZeros(NonZeros); 5868 SDValue Item = Op.getOperand(Idx); 5869 5870 // If this is an insertion of an i64 value on x86-32, and if the top bits of 5871 // the value are obviously zero, truncate the value to i32 and do the 5872 // insertion that way. Only do this if the value is non-constant or if the 5873 // value is a constant being inserted into element 0. It is cheaper to do 5874 // a constant pool load than it is to do a movd + shuffle. 5875 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 5876 (!IsAllConstants || Idx == 0)) { 5877 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 5878 // Handle SSE only. 5879 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 5880 EVT VecVT = MVT::v4i32; 5881 unsigned VecElts = 4; 5882 5883 // Truncate the value (which may itself be a constant) to i32, and 5884 // convert it to a vector with movd (S2V+shuffle to zero extend). 5885 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 5886 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 5887 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5888 5889 // Now we have our 32-bit value zero extended in the low element of 5890 // a vector. If Idx != 0, swizzle it into place. 5891 if (Idx != 0) { 5892 SmallVector<int, 4> Mask; 5893 Mask.push_back(Idx); 5894 for (unsigned i = 1; i != VecElts; ++i) 5895 Mask.push_back(i); 5896 Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT), 5897 &Mask[0]); 5898 } 5899 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 5900 } 5901 } 5902 5903 // If we have a constant or non-constant insertion into the low element of 5904 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 5905 // the rest of the elements. This will be matched as movd/movq/movss/movsd 5906 // depending on what the source datatype is. 
5907 if (Idx == 0) { 5908 if (NumZero == 0) 5909 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5910 5911 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 5912 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 5913 if (VT.is256BitVector() || VT.is512BitVector()) { 5914 SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); 5915 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, 5916 Item, DAG.getIntPtrConstant(0)); 5917 } 5918 assert(VT.is128BitVector() && "Expected an SSE value type!"); 5919 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5920 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 5921 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5922 } 5923 5924 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 5925 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 5926 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); 5927 if (VT.is256BitVector()) { 5928 SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); 5929 Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); 5930 } else { 5931 assert(VT.is128BitVector() && "Expected an SSE value type!"); 5932 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5933 } 5934 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 5935 } 5936 } 5937 5938 // Is it a vector logical left shift? 5939 if (NumElems == 2 && Idx == 1 && 5940 X86::isZeroNode(Op.getOperand(0)) && 5941 !X86::isZeroNode(Op.getOperand(1))) { 5942 unsigned NumBits = VT.getSizeInBits(); 5943 return getVShift(true, VT, 5944 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5945 VT, Op.getOperand(1)), 5946 NumBits/2, DAG, *this, dl); 5947 } 5948 5949 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 5950 return SDValue(); 5951 5952 // Otherwise, if this is a vector with i32 or f32 elements, and the element 5953 // is a non-constant being inserted into an element other than the low one, 5954 // we can't use a constant pool load. 
Instead, use SCALAR_TO_VECTOR (aka 5955 // movd/movss) to move this into the low element, then shuffle it into 5956 // place. 5957 if (EVTBits == 32) { 5958 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5959 5960 // Turn it into a shuffle of zero and zero-extended scalar to vector. 5961 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); 5962 SmallVector<int, 8> MaskVec; 5963 for (unsigned i = 0; i != NumElems; ++i) 5964 MaskVec.push_back(i == Idx ? 0 : 1); 5965 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 5966 } 5967 } 5968 5969 // Splat is obviously ok. Let legalizer expand it to a shuffle. 5970 if (Values.size() == 1) { 5971 if (EVTBits == 32) { 5972 // Instead of a shuffle like this: 5973 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 5974 // Check if it's possible to issue this instead. 5975 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 5976 unsigned Idx = countTrailingZeros(NonZeros); 5977 SDValue Item = Op.getOperand(Idx); 5978 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 5979 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 5980 } 5981 return SDValue(); 5982 } 5983 5984 // A vector full of immediates; various special cases are already 5985 // handled, so this is best done with a single constant-pool load. 5986 if (IsAllConstants) 5987 return SDValue(); 5988 5989 // For AVX-length vectors, build the individual 128-bit pieces and use 5990 // shuffles to put them in place. 5991 if (VT.is256BitVector()) { 5992 SmallVector<SDValue, 32> V; 5993 for (unsigned i = 0; i != NumElems; ++i) 5994 V.push_back(Op.getOperand(i)); 5995 5996 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); 5997 5998 // Build both the lower and upper subvector. 
5999 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); 6000 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], 6001 NumElems/2); 6002 6003 // Recreate the wider vector with the lower and upper part. 6004 return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); 6005 } 6006 6007 // Let legalizer expand 2-wide build_vectors. 6008 if (EVTBits == 64) { 6009 if (NumNonZero == 1) { 6010 // One half is zero or undef. 6011 unsigned Idx = countTrailingZeros(NonZeros); 6012 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 6013 Op.getOperand(Idx)); 6014 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); 6015 } 6016 return SDValue(); 6017 } 6018 6019 // If element VT is < 32 bits, convert it to inserts into a zero vector. 6020 if (EVTBits == 8 && NumElems == 16) { 6021 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 6022 Subtarget, *this); 6023 if (V.getNode()) return V; 6024 } 6025 6026 if (EVTBits == 16 && NumElems == 8) { 6027 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 6028 Subtarget, *this); 6029 if (V.getNode()) return V; 6030 } 6031 6032 // If element VT is == 32 bits, turn it into a number of shuffles. 6033 SmallVector<SDValue, 8> V(NumElems); 6034 if (NumElems == 4 && NumZero > 0) { 6035 for (unsigned i = 0; i < 4; ++i) { 6036 bool isZero = !(NonZeros & (1 << i)); 6037 if (isZero) 6038 V[i] = getZeroVector(VT, Subtarget, DAG, dl); 6039 else 6040 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 6041 } 6042 6043 for (unsigned i = 0; i < 2; ++i) { 6044 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 6045 default: break; 6046 case 0: 6047 V[i] = V[i*2]; // Must be a zero vector. 
6048 break; 6049 case 1: 6050 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 6051 break; 6052 case 2: 6053 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 6054 break; 6055 case 3: 6056 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 6057 break; 6058 } 6059 } 6060 6061 bool Reverse1 = (NonZeros & 0x3) == 2; 6062 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; 6063 int MaskVec[] = { 6064 Reverse1 ? 1 : 0, 6065 Reverse1 ? 0 : 1, 6066 static_cast<int>(Reverse2 ? NumElems+1 : NumElems), 6067 static_cast<int>(Reverse2 ? NumElems : NumElems+1) 6068 }; 6069 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 6070 } 6071 6072 if (Values.size() > 1 && VT.is128BitVector()) { 6073 // Check for a build vector of consecutive loads. 6074 for (unsigned i = 0; i < NumElems; ++i) 6075 V[i] = Op.getOperand(i); 6076 6077 // Check for elements which are consecutive loads. 6078 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 6079 if (LD.getNode()) 6080 return LD; 6081 6082 // Check for a build vector from mostly shuffle plus few inserting. 6083 SDValue Sh = buildFromShuffleMostly(Op, DAG); 6084 if (Sh.getNode()) 6085 return Sh; 6086 6087 // For SSE 4.1, use insertps to put the high elements into the low element. 6088 if (getSubtarget()->hasSSE41()) { 6089 SDValue Result; 6090 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 6091 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 6092 else 6093 Result = DAG.getUNDEF(VT); 6094 6095 for (unsigned i = 1; i < NumElems; ++i) { 6096 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 6097 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 6098 Op.getOperand(i), DAG.getIntPtrConstant(i)); 6099 } 6100 return Result; 6101 } 6102 6103 // Otherwise, expand into a number of unpckl*, start by extending each of 6104 // our (non-undef) elements to the full vector width with the element in the 6105 // bottom slot of the vector (which generates no code for SSE). 
6106 for (unsigned i = 0; i < NumElems; ++i) { 6107 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 6108 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 6109 else 6110 V[i] = DAG.getUNDEF(VT); 6111 } 6112 6113 // Next, we iteratively mix elements, e.g. for v4f32: 6114 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 6115 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 6116 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 6117 unsigned EltStride = NumElems >> 1; 6118 while (EltStride != 0) { 6119 for (unsigned i = 0; i < EltStride; ++i) { 6120 // If V[i+EltStride] is undef and this is the first round of mixing, 6121 // then it is safe to just drop this shuffle: V[i] is already in the 6122 // right place, the one element (since it's the first round) being 6123 // inserted as undef can be dropped. This isn't safe for successive 6124 // rounds because they will permute elements within both vectors. 6125 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 6126 EltStride == NumElems/2) 6127 continue; 6128 6129 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 6130 } 6131 EltStride >>= 1; 6132 } 6133 return V[0]; 6134 } 6135 return SDValue(); 6136} 6137 6138// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction 6139// to create 256-bit vectors from two other 128-bit ones. 
6140static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 6141 SDLoc dl(Op); 6142 MVT ResVT = Op.getSimpleValueType(); 6143 6144 assert((ResVT.is256BitVector() || 6145 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); 6146 6147 SDValue V1 = Op.getOperand(0); 6148 SDValue V2 = Op.getOperand(1); 6149 unsigned NumElems = ResVT.getVectorNumElements(); 6150 if(ResVT.is256BitVector()) 6151 return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); 6152 6153 return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); 6154} 6155 6156static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 6157 assert(Op.getNumOperands() == 2); 6158 6159 // AVX/AVX-512 can use the vinsertf128 instruction to create 256-bit vectors 6160 // from two other 128-bit ones. 6161 return LowerAVXCONCAT_VECTORS(Op, DAG); 6162} 6163 6164// Try to lower a shuffle node into a simple blend instruction. 6165static SDValue 6166LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, 6167 const X86Subtarget *Subtarget, SelectionDAG &DAG) { 6168 SDValue V1 = SVOp->getOperand(0); 6169 SDValue V2 = SVOp->getOperand(1); 6170 SDLoc dl(SVOp); 6171 MVT VT = SVOp->getSimpleValueType(0); 6172 MVT EltVT = VT.getVectorElementType(); 6173 unsigned NumElems = VT.getVectorNumElements(); 6174 6175 // There is no blend with immediate in AVX-512. 6176 if (VT.is512BitVector()) 6177 return SDValue(); 6178 6179 if (!Subtarget->hasSSE41() || EltVT == MVT::i8) 6180 return SDValue(); 6181 if (!Subtarget->hasInt256() && VT == MVT::v16i16) 6182 return SDValue(); 6183 6184 // Check the mask for BLEND and build the value. 6185 unsigned MaskValue = 0; 6186 // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. 6187 unsigned NumLanes = (NumElems-1)/8 + 1; 6188 unsigned NumElemsInLane = NumElems / NumLanes; 6189 6190 // Blend for v16i16 should be symetric for the both lanes. 6191 for (unsigned i = 0; i < NumElemsInLane; ++i) { 6192 6193 int SndLaneEltIdx = (NumLanes == 2) ? 
6194 SVOp->getMaskElt(i + NumElemsInLane) : -1; 6195 int EltIdx = SVOp->getMaskElt(i); 6196 6197 if ((EltIdx < 0 || EltIdx == (int)i) && 6198 (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane))) 6199 continue; 6200 6201 if (((unsigned)EltIdx == (i + NumElems)) && 6202 (SndLaneEltIdx < 0 || 6203 (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane)) 6204 MaskValue |= (1<<i); 6205 else 6206 return SDValue(); 6207 } 6208 6209 // Convert i32 vectors to floating point if it is not AVX2. 6210 // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. 6211 MVT BlendVT = VT; 6212 if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { 6213 BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), 6214 NumElems); 6215 V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1); 6216 V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2); 6217 } 6218 6219 SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2, 6220 DAG.getConstant(MaskValue, MVT::i32)); 6221 return DAG.getNode(ISD::BITCAST, dl, VT, Ret); 6222} 6223 6224// v8i16 shuffles - Prefer shuffles in the following order: 6225// 1. [all] pshuflw, pshufhw, optional move 6226// 2. [ssse3] 1 x pshufb 6227// 3. [ssse3] 2 x pshufb + 1 x por 6228// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 6229static SDValue 6230LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, 6231 SelectionDAG &DAG) { 6232 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6233 SDValue V1 = SVOp->getOperand(0); 6234 SDValue V2 = SVOp->getOperand(1); 6235 SDLoc dl(SVOp); 6236 SmallVector<int, 8> MaskVals; 6237 6238 // Determine if more than 1 of the words in each of the low and high quadwords 6239 // of the result come from the same quadword of one of the two inputs. Undef 6240 // mask values count as coming from any quadword, for better codegen. 
6241 unsigned LoQuad[] = { 0, 0, 0, 0 }; 6242 unsigned HiQuad[] = { 0, 0, 0, 0 }; 6243 std::bitset<4> InputQuads; 6244 for (unsigned i = 0; i < 8; ++i) { 6245 unsigned *Quad = i < 4 ? LoQuad : HiQuad; 6246 int EltIdx = SVOp->getMaskElt(i); 6247 MaskVals.push_back(EltIdx); 6248 if (EltIdx < 0) { 6249 ++Quad[0]; 6250 ++Quad[1]; 6251 ++Quad[2]; 6252 ++Quad[3]; 6253 continue; 6254 } 6255 ++Quad[EltIdx / 4]; 6256 InputQuads.set(EltIdx / 4); 6257 } 6258 6259 int BestLoQuad = -1; 6260 unsigned MaxQuad = 1; 6261 for (unsigned i = 0; i < 4; ++i) { 6262 if (LoQuad[i] > MaxQuad) { 6263 BestLoQuad = i; 6264 MaxQuad = LoQuad[i]; 6265 } 6266 } 6267 6268 int BestHiQuad = -1; 6269 MaxQuad = 1; 6270 for (unsigned i = 0; i < 4; ++i) { 6271 if (HiQuad[i] > MaxQuad) { 6272 BestHiQuad = i; 6273 MaxQuad = HiQuad[i]; 6274 } 6275 } 6276 6277 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 6278 // of the two input vectors, shuffle them into one input vector so only a 6279 // single pshufb instruction is necessary. If There are more than 2 input 6280 // quads, disable the next transformation since it does not help SSSE3. 6281 bool V1Used = InputQuads[0] || InputQuads[1]; 6282 bool V2Used = InputQuads[2] || InputQuads[3]; 6283 if (Subtarget->hasSSSE3()) { 6284 if (InputQuads.count() == 2 && V1Used && V2Used) { 6285 BestLoQuad = InputQuads[0] ? 0 : 1; 6286 BestHiQuad = InputQuads[2] ? 2 : 3; 6287 } 6288 if (InputQuads.count() > 2) { 6289 BestLoQuad = -1; 6290 BestHiQuad = -1; 6291 } 6292 } 6293 6294 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 6295 // the shuffle mask. If a quad is scored as -1, that means that it contains 6296 // words from all 4 input quadwords. 6297 SDValue NewV; 6298 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 6299 int MaskV[] = { 6300 BestLoQuad < 0 ? 0 : BestLoQuad, 6301 BestHiQuad < 0 ? 
1 : BestHiQuad 6302 }; 6303 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 6304 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 6305 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 6306 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 6307 6308 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 6309 // source words for the shuffle, to aid later transformations. 6310 bool AllWordsInNewV = true; 6311 bool InOrder[2] = { true, true }; 6312 for (unsigned i = 0; i != 8; ++i) { 6313 int idx = MaskVals[i]; 6314 if (idx != (int)i) 6315 InOrder[i/4] = false; 6316 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 6317 continue; 6318 AllWordsInNewV = false; 6319 break; 6320 } 6321 6322 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 6323 if (AllWordsInNewV) { 6324 for (int i = 0; i != 8; ++i) { 6325 int idx = MaskVals[i]; 6326 if (idx < 0) 6327 continue; 6328 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 6329 if ((idx != i) && idx < 4) 6330 pshufhw = false; 6331 if ((idx != i) && idx > 3) 6332 pshuflw = false; 6333 } 6334 V1 = NewV; 6335 V2Used = false; 6336 BestLoQuad = 0; 6337 BestHiQuad = 1; 6338 } 6339 6340 // If we've eliminated the use of V2, and the new mask is a pshuflw or 6341 // pshufhw, that's as cheap as it gets. Return the new shuffle. 6342 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 6343 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 6344 unsigned TargetMask = 0; 6345 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 6346 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 6347 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 6348 TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp): 6349 getShufflePSHUFLWImmediate(SVOp); 6350 V1 = NewV.getOperand(0); 6351 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 6352 } 6353 } 6354 6355 // Promote splats to a larger type which usually leads to more efficient code. 
6356 // FIXME: Is this true if pshufb is available? 6357 if (SVOp->isSplat()) 6358 return PromoteSplat(SVOp, DAG); 6359 6360 // If we have SSSE3, and all words of the result are from 1 input vector, 6361 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 6362 // is present, fall back to case 4. 6363 if (Subtarget->hasSSSE3()) { 6364 SmallVector<SDValue,16> pshufbMask; 6365 6366 // If we have elements from both input vectors, set the high bit of the 6367 // shuffle mask element to zero out elements that come from V2 in the V1 6368 // mask, and elements that come from V1 in the V2 mask, so that the two 6369 // results can be OR'd together. 6370 bool TwoInputs = V1Used && V2Used; 6371 for (unsigned i = 0; i != 8; ++i) { 6372 int EltIdx = MaskVals[i] * 2; 6373 int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx; 6374 int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1; 6375 pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); 6376 pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); 6377 } 6378 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); 6379 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 6380 DAG.getNode(ISD::BUILD_VECTOR, dl, 6381 MVT::v16i8, &pshufbMask[0], 16)); 6382 if (!TwoInputs) 6383 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 6384 6385 // Calculate the shuffle mask for the second input, shuffle it, and 6386 // OR it with the first shuffled input. 6387 pshufbMask.clear(); 6388 for (unsigned i = 0; i != 8; ++i) { 6389 int EltIdx = MaskVals[i] * 2; 6390 int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16; 6391 int Idx1 = (EltIdx < 16) ? 
0x80 : EltIdx - 15; 6392 pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); 6393 pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); 6394 } 6395 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); 6396 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 6397 DAG.getNode(ISD::BUILD_VECTOR, dl, 6398 MVT::v16i8, &pshufbMask[0], 16)); 6399 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 6400 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 6401 } 6402 6403 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 6404 // and update MaskVals with new element order. 6405 std::bitset<8> InOrder; 6406 if (BestLoQuad >= 0) { 6407 int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 }; 6408 for (int i = 0; i != 4; ++i) { 6409 int idx = MaskVals[i]; 6410 if (idx < 0) { 6411 InOrder.set(i); 6412 } else if ((idx / 4) == BestLoQuad) { 6413 MaskV[i] = idx & 3; 6414 InOrder.set(i); 6415 } 6416 } 6417 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 6418 &MaskV[0]); 6419 6420 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { 6421 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 6422 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 6423 NewV.getOperand(0), 6424 getShufflePSHUFLWImmediate(SVOp), DAG); 6425 } 6426 } 6427 6428 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 6429 // and update MaskVals with the new element order. 
6430 if (BestHiQuad >= 0) { 6431 int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 }; 6432 for (unsigned i = 4; i != 8; ++i) { 6433 int idx = MaskVals[i]; 6434 if (idx < 0) { 6435 InOrder.set(i); 6436 } else if ((idx / 4) == BestHiQuad) { 6437 MaskV[i] = (idx & 3) + 4; 6438 InOrder.set(i); 6439 } 6440 } 6441 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 6442 &MaskV[0]); 6443 6444 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { 6445 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 6446 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 6447 NewV.getOperand(0), 6448 getShufflePSHUFHWImmediate(SVOp), DAG); 6449 } 6450 } 6451 6452 // In case BestHi & BestLo were both -1, which means each quadword has a word 6453 // from each of the four input quadwords, calculate the InOrder bitvector now 6454 // before falling through to the insert/extract cleanup. 6455 if (BestLoQuad == -1 && BestHiQuad == -1) { 6456 NewV = V1; 6457 for (int i = 0; i != 8; ++i) 6458 if (MaskVals[i] < 0 || MaskVals[i] == i) 6459 InOrder.set(i); 6460 } 6461 6462 // The other elements are put in the right place using pextrw and pinsrw. 6463 for (unsigned i = 0; i != 8; ++i) { 6464 if (InOrder[i]) 6465 continue; 6466 int EltIdx = MaskVals[i]; 6467 if (EltIdx < 0) 6468 continue; 6469 SDValue ExtOp = (EltIdx < 8) ? 6470 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 6471 DAG.getIntPtrConstant(EltIdx)) : 6472 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 6473 DAG.getIntPtrConstant(EltIdx - 8)); 6474 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 6475 DAG.getIntPtrConstant(i)); 6476 } 6477 return NewV; 6478} 6479 6480// v16i8 shuffles - Prefer shuffles in the following order: 6481// 1. [ssse3] 1 x pshufb 6482// 2. [ssse3] 2 x pshufb + 1 x por 6483// 3. 
//    [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
                                        const X86Subtarget* Subtarget,
                                        SelectionDAG &DAG) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  SDLoc dl(SVOp);
  ArrayRef<int> MaskVals = SVOp->getMask();

  // Promote splats to a larger type which usually leads to more efficient code.
  // FIXME: Is this true if pshufb is available?
  if (SVOp->isSplat())
    return PromoteSplat(SVOp, DAG);

  // If we have SSSE3, case 1 is generated when all result bytes come from
  // one of the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
  // present, fall back to case 3.

  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
  if (Subtarget->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;

    // If all result elements are from one input vector, then only translate
    // undef mask values to 0x80 (zero out result) in the pshufb mask.
    //
    // Otherwise, we have elements from both input vectors, and must zero out
    // elements that come from V2 in the first mask, and V1 in the second mask
    // so that we can OR them together.
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      // Undef or V2-sourced bytes become 0x80: PSHUFB zeroes slots whose
      // mask byte has the high bit set.
      if (EltIdx < 0 || EltIdx >= 16)
        EltIdx = 0x80;
      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
    }
    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));

    // As PSHUFB will zero elements with negative indices, it's safe to ignore
    // the 2nd operand if it's undefined or zero.
    if (V2.getOpcode() == ISD::UNDEF ||
        ISD::isBuildVectorAllZeros(V2.getNode()))
      return V1;

    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
    pshufbMask.clear();
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      // V1-sourced (and undef) bytes are zeroed here; V2 indices are
      // rebased from 16..31 down to 0..15.
      EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
    }
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
  }

  // No SSSE3 - Calculate in place words and then fix all out of place words
  // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
  // the 16 different words that comprise the two doublequadword input vectors.
  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
  SDValue NewV = V1;
  // Work one 16-bit word (= 2 mask bytes) of the result at a time.
  for (int i = 0; i != 8; ++i) {
    int Elt0 = MaskVals[i*2];
    int Elt1 = MaskVals[i*2+1];

    // This word of the result is all undef, skip it.
    if (Elt0 < 0 && Elt1 < 0)
      continue;

    // This word of the result is already in the correct place, skip it.
    if ((Elt0 == i*2) && (Elt1 == i*2+1))
      continue;

    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
    SDValue InsElt;

    // If Elt0 and Elt1 are defined, are consecutive, and can be load
    // using a single extract together, load it and store it.
    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                         DAG.getIntPtrConstant(i));
      continue;
    }

    // If Elt1 is defined, extract it from the appropriate source.  If the
    // source byte is not also odd, shift the extracted word left 8 bits
    // otherwise clear the bottom 8 bits if we need to do an or.
    if (Elt1 >= 0) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      if ((Elt1 & 1) == 0)
        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
                             DAG.getConstant(8,
                                  TLI.getShiftAmountTy(InsElt.getValueType())));
      else if (Elt0 >= 0)
        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
                             DAG.getConstant(0xFF00, MVT::i16));
    }
    // If Elt0 is defined, extract it from the appropriate source.  If the
    // source byte is not also even, shift the extracted word right 8 bits. If
    // Elt1 was also defined, OR the extracted values together before
    // inserting them in the result.
    if (Elt0 >= 0) {
      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
      if ((Elt0 & 1) != 0)
        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
                              DAG.getConstant(8,
                                 TLI.getShiftAmountTy(InsElt0.getValueType())));
      else if (Elt1 >= 0)
        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
                              DAG.getConstant(0x00FF, MVT::i16));
      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
                         : InsElt0;
    }
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                       DAG.getIntPtrConstant(i));
  }
  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
}

// v32i8 shuffles - Translate to VPSHUFB if possible.
static
SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
                                 const X86Subtarget *Subtarget,
                                 SelectionDAG &DAG) {
  MVT VT = SVOp->getSimpleValueType(0);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  SDLoc dl(SVOp);
  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());

  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());

  // VPSHUFB may be generated if
  // (1) one of input vector is undefined or zeroinitializer.
  // The mask value 0x80 puts 0 in the corresponding slot of the vector.
  // And (2) the mask indexes don't cross the 128-bit lane.
  if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
    return SDValue();

  // If V1 is the zero vector, swap so the meaningful data is in V1 and the
  // mask refers to it; zeros then come from the 0x80 mask entries below.
  if (V1IsAllZero && !V2IsAllZero) {
    CommuteVectorShuffleMask(MaskVals, 32);
    V1 = V2;
  }
  SmallVector<SDValue, 32> pshufbMask;
  for (unsigned i = 0; i != 32; i++) {
    int EltIdx = MaskVals[i];
    if (EltIdx < 0 || EltIdx >= 32)
      EltIdx = 0x80;
    else {
      if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
        // Cross lane is not allowed: AVX2 VPSHUFB shuffles each 128-bit
        // lane independently.
        return SDValue();
      EltIdx &= 0xf;
    }
    pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
  }
  return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
                      DAG.getNode(ISD::BUILD_VECTOR, dl,
                                  MVT::v32i8, &pshufbMask[0], 32));
}

/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements point to elements in
/// the right sequence. e.g.
/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG) {
  MVT VT = SVOp->getSimpleValueType(0);
  SDLoc dl(SVOp);
  unsigned NumElems = VT.getVectorNumElements();
  MVT NewVT;
  // Scale is how many original elements merge into one wide element.
  unsigned Scale;
  switch (VT.SimpleTy) {
  default: llvm_unreachable("Unexpected!");
  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
  }

  SmallVector<int, 8> MaskVec;
  for (unsigned i = 0; i != NumElems; i += Scale) {
    int StartIdx = -1;
    // Every defined element in this group must point at consecutive
    // positions of the same Scale-aligned source group.
    for (unsigned j = 0; j != Scale; ++j) {
      int EltIdx = SVOp->getMaskElt(i+j);
      if (EltIdx < 0)
        continue;
      if (StartIdx < 0)
        StartIdx = (EltIdx / Scale);
      if (EltIdx != (int)(StartIdx*Scale + j))
        return SDValue();
    }
    // StartIdx stays -1 (undef) if the whole group was undef.
    MaskVec.push_back(StartIdx);
  }

  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
}

/// getVZextMovL - Return a
/// zero-extending vector move low node (X86ISD::VZEXT_MOVL): the low element
/// of SrcOp moved into element 0 of the result with the upper elements zeroed.
///
static SDValue getVZextMovL(MVT VT, MVT OpVT,
                            SDValue SrcOp, SelectionDAG &DAG,
                            const X86Subtarget *Subtarget, SDLoc dl) {
  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
    LoadSDNode *LD = NULL;
    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
      LD = dyn_cast<LoadSDNode>(SrcOp);
    if (!LD) {
      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
      // instead.
      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
        // PR2108: build the VZEXT_MOVL over the equivalent integer vector
        // type so it selects to movd/movq (which zero the high bits).
        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
        return DAG.getNode(ISD::BITCAST, dl, VT,
                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   OpVT,
                                                   SrcOp.getOperand(0)
                                                          .getOperand(0))));
      }
    }
  }

  // Generic path: bitcast to OpVT, zero-extend-move the low element, and
  // bitcast the result back to VT.
  return DAG.getNode(ISD::BITCAST, dl, VT,
                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                 DAG.getNode(ISD::BITCAST, dl,
                                             OpVT, SrcOp)));
}

/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
/// which could not be matched by any known target specific shuffle.
/// Decomposes the shuffle into two independent 128-bit half shuffles and
/// concatenates the results.
static SDValue
LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {

  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
  if (NewOp.getNode())
    return NewOp;

  MVT VT = SVOp->getSimpleValueType(0);

  unsigned NumElems = VT.getVectorNumElements();
  unsigned NumLaneElems = NumElems / 2;

  SDLoc dl(SVOp);
  MVT EltVT = VT.getVectorElementType();
  // NVT is the 128-bit half-width vector type; Output holds one half each.
  MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
  SDValue Output[2];

  SmallVector<int, 16> Mask;
  for (unsigned l = 0; l < 2; ++l) {
    // Build a shuffle mask for the output, discovering on the fly which
    // input vectors to use as shuffle operands (recorded in InputUsed).
    // If building a suitable shuffle vector proves too hard, then bail
    // out with UseBuildVector set.
    bool UseBuildVector = false;
    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
    unsigned LaneStart = l * NumLaneElems;
    for (unsigned i = 0; i != NumLaneElems; ++i) {
      // The mask element.  This indexes into the input.
      int Idx = SVOp->getMaskElt(i+LaneStart);
      if (Idx < 0) {
        // the mask element does not index into any input vector.
        Mask.push_back(-1);
        continue;
      }

      // The input vector this mask element indexes into.
      // Here "input" means one of the four 128-bit halves of V1/V2, hence
      // the division by NumLaneElems.
      int Input = Idx / NumLaneElems;

      // Turn the index into an offset from the start of the input vector.
      Idx -= Input * NumLaneElems;

      // Find or create a shuffle vector operand to hold this input.
      unsigned OpNo;
      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
        if (InputUsed[OpNo] == Input)
          // This input vector is already an operand.
          break;
        if (InputUsed[OpNo] < 0) {
          // Create a new operand for this input vector.
          InputUsed[OpNo] = Input;
          break;
        }
      }

      if (OpNo >= array_lengthof(InputUsed)) {
        // More than two input vectors used!  Give up on trying to create a
        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
        UseBuildVector = true;
        break;
      }

      // Add the mask index for the new shuffle vector.
      Mask.push_back(Idx + OpNo * NumLaneElems);
    }

    if (UseBuildVector) {
      SmallVector<SDValue, 16> SVOps;
      for (unsigned i = 0; i != NumLaneElems; ++i) {
        // The mask element.  This indexes into the input.
        int Idx = SVOp->getMaskElt(i+LaneStart);
        if (Idx < 0) {
          SVOps.push_back(DAG.getUNDEF(EltVT));
          continue;
        }

        // The input vector this mask element indexes into.
        // NOTE: unlike above, this divides by NumElems because here the
        // element is extracted straight from the full-width shuffle
        // operand (SVOp->getOperand), not from a 128-bit half.
        int Input = Idx / NumElems;

        // Turn the index into an offset from the start of the input vector.
        Idx -= Input * NumElems;

        // Extract the vector element by hand.
        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                    SVOp->getOperand(Input),
                                    DAG.getIntPtrConstant(Idx)));
      }

      // Construct the output using a BUILD_VECTOR.
      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
                              SVOps.size());
    } else if (InputUsed[0] < 0) {
      // No input vectors were used! The result is undefined.
      Output[l] = DAG.getUNDEF(NVT);
    } else {
      // InputUsed[i] / 2 selects V1 or V2; InputUsed[i] % 2 selects its
      // low or high 128-bit half.
      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
                                        (InputUsed[0] % 2) * NumLaneElems,
                                        DAG, dl);
      // If only one input was used, use an undefined vector for the other.
      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
      // At least one input vector was used. Create a new shuffle vector.
      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
    }

    Mask.clear();
  }

  // Concatenate the result back
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
}

/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
/// 4 elements, and match them with several different shuffle types.
static SDValue
LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  SDLoc dl(SVOp);
  MVT VT = SVOp->getSimpleValueType(0);

  assert(VT.is128BitVector() && "Unsupported vector size");

  // Locs[i] records, for each result element, which intermediate shuffle
  // produced it (first) and at which position (second); (-1,-1) = undef.
  std::pair<int, int> Locs[4];
  int Mask1[] = { -1, -1, -1, -1 };
  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());

  // Count how many result elements come from V1 (NumLo) vs V2 (NumHi),
  // gathering V1 elements into the low half of Mask1 and V2 elements into
  // its high half.
  unsigned NumHi = 0;
  unsigned NumLo = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else {
      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
      if (Idx < 4) {
        Locs[i] = std::make_pair(0, NumLo);
        Mask1[NumLo] = Idx;
        NumLo++;
      } else {
        Locs[i] = std::make_pair(1, NumHi);
        if (2+NumHi < 4)
          Mask1[2+NumHi] = Idx;
        NumHi++;
      }
    }
  }

  if (NumLo <= 2 && NumHi <= 2) {
    // If no more than two elements come from either vector. This can be
    // implemented with two shuffles. First shuffle gather the elements.
    // The second shuffle, which takes the first shuffle as both of its
    // vector operands, put the elements into the right order.
    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);

    int Mask2[] = { -1, -1, -1, -1 };

    for (unsigned i = 0; i != 4; ++i)
      if (Locs[i].first != -1) {
        unsigned Idx = (i < 2) ? 0 : 4;
        Idx += Locs[i].first * 2 + Locs[i].second;
        Mask2[i] = Idx;
      }

    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
  }

  if (NumLo == 3 || NumHi == 3) {
    // Otherwise, we must have three elements from one vector, call it X, and
    // one element from the other, call it Y. First, use a shufps to build an
    // intermediate vector with the one element from Y and the element from X
    // that will be in the same half in the final destination (the indexes don't
    // matter). Then, use a shufps to build the final vector, taking the half
    // containing the element from Y from the intermediate, and the other half
    // from X.
    if (NumHi == 3) {
      // Normalize it so the 3 elements come from V1.
      CommuteVectorShuffleMask(PermMask, 4);
      std::swap(V1, V2);
    }

    // Find the element from V2.
    unsigned HiIndex;
    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
      int Val = PermMask[HiIndex];
      if (Val < 0)
        continue;
      if (Val >= 4)
        break;
    }

    // Build the intermediate: the V2 element plus its half-mate from V1
    // (HiIndex^1 is the other slot of the same destination half).
    Mask1[0] = PermMask[HiIndex];
    Mask1[1] = -1;
    Mask1[2] = PermMask[HiIndex^1];
    Mask1[3] = -1;
    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);

    if (HiIndex >= 2) {
      // The V2 element lands in the high half of the result.
      Mask1[0] = PermMask[0];
      Mask1[1] = PermMask[1];
      Mask1[2] = HiIndex & 1 ? 6 : 4;
      Mask1[3] = HiIndex & 1 ? 4 : 6;
      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
    }

    // The V2 element lands in the low half: take that half from the
    // intermediate (operand 0) and the high half from X (operand 1).
    Mask1[0] = HiIndex & 1 ? 2 : 0;
    Mask1[1] = HiIndex & 1 ? 0 : 2;
    Mask1[2] = PermMask[2];
    Mask1[3] = PermMask[3];
    if (Mask1[2] >= 0)
      Mask1[2] += 4;
    if (Mask1[3] >= 0)
      Mask1[3] += 4;
    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
  }

  // Break it into (shuffle shuffle_hi, shuffle_lo).
  int LoMask[] = { -1, -1, -1, -1 };
  int HiMask[] = { -1, -1, -1, -1 };

  int *MaskPtr = LoMask;
  unsigned MaskIdx = 0;
  unsigned LoIdx = 0;
  unsigned HiIdx = 2;
  for (unsigned i = 0; i != 4; ++i) {
    if (i == 2) {
      // Switch to filling the mask for the high-half shuffle.
      MaskPtr = HiMask;
      MaskIdx = 1;
      LoIdx = 0;
      HiIdx = 2;
    }
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else if (Idx < 4) {
      Locs[i] = std::make_pair(MaskIdx, LoIdx);
      MaskPtr[LoIdx] = Idx;
      LoIdx++;
    } else {
      Locs[i] = std::make_pair(MaskIdx, HiIdx);
      MaskPtr[HiIdx] = Idx;
      HiIdx++;
    }
  }

  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
  int MaskOps[] = { -1, -1, -1, -1 };
  for (unsigned i = 0; i != 4; ++i)
    if (Locs[i].first != -1)
      MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
}

/// MayFoldVectorLoad - Return true if V, looked through single-use bitcasts
/// and scalar_to_vector / (BUILD_VECTOR x, undef) wrappers, bottoms out in a
/// load that instruction selection may fold into a memory operand.
static bool MayFoldVectorLoad(SDValue V) {
  while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);

  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    V = V.getOperand(0);
  if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
      V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
    // BUILD_VECTOR (load), undef
    V = V.getOperand(0);

  return MayFoldLoad(V);
}

/// getMOVDDup - Emit a MOVDDUP target shuffle of V1, bitcasting through
/// v2f64 (the type MOVDDUP operates on) and back to Op's type.
static
SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Canonicalize to v2f64.
  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
  return DAG.getNode(ISD::BITCAST, dl, VT,
                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
                                          V1, DAG));
}

/// getMOVLowToHigh - Lower a shuffle moving the low half of V2 into the high
/// half of V1 as MOVLHPD (v2f64 with SSE2) or MOVLHPS via v4f32.
static
SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
                        bool HasSSE2) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  MVT VT = Op.getSimpleValueType();

  assert(VT != MVT::v2i64 && "unsupported shuffle type");

  if (HasSSE2 && VT == MVT::v2f64)
    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);

  // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
  return DAG.getNode(ISD::BITCAST, dl, VT,
                     getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
                             DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
                             DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
}

/// getMOVHighToLow - Lower a shuffle moving the high half of V2 into the low
/// half of V1 as a MOVHLPS target node (v4i32/v4f32 only).
static
SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  MVT VT = Op.getSimpleValueType();

  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
         "unsupported shuffle type");

  if (V2.getOpcode() == ISD::UNDEF)
    V2 = V1;

  // v4i32 or v4f32
  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
}

/// getMOVLP - Lower a "move low packed" style shuffle, preferring
/// MOVLPS/MOVLPD when a load can be folded, otherwise MOVSD/MOVSS (SSE2) or
/// a commuted SHUFPS (SSE1).
static
SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  MVT VT = Op.getSimpleValueType();
  unsigned NumElems = VT.getVectorNumElements();

  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
  // operand of these instructions is only memory, so check if there's a
  // potential load folding here, otherwise use SHUFPS or MOVSD to match the
  // same masks.
  bool CanFoldLoad = false;

  // Trivial case, when V2 comes from a load.
  if (MayFoldVectorLoad(V2))
    CanFoldLoad = true;

  // When V1 is a load, it can be folded later into a store in isel, example:
  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
  //    turns into:
  //  (MOVLPSmr addr:$src1, VR128:$src2)
  // So, recognize this potential and also use MOVLPS or MOVLPD
  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
    CanFoldLoad = true;

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  if (CanFoldLoad) {
    if (HasSSE2 && NumElems == 2)
      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);

    if (NumElems == 4)
      // If we don't care about the second element, proceed to use movss.
      if (SVOp->getMaskElt(1) != -1)
        return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
  }

  // movl and movlp will both match v2i64, but v2i64 is never matched by
  // movl earlier because we make it strict to avoid messing with the movlp load
  // folding logic (see the code above getMOVLP call). Match it here then,
  // this is horrible, but will stay like this until we move all shuffle
  // matching to x86 specific nodes. Note that for the 1st condition all
  // types are matched with movsd.
  if (HasSSE2) {
    // FIXME: isMOVLMask should be checked and matched before getMOVLP,
    // as to remove this logic from here, as much as possible
    if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
  }

  assert(VT != MVT::v4i32 && "unsupported shuffle type");

  // Invert the operand order and use SHUFPS to match it.
  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
                              getShuffleSHUFImmediate(SVOp), DAG);
}

// Reduce a vector shuffle to zext.
static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
                                    SelectionDAG &DAG) {
  // PMOVZX is only available from SSE41.
  if (!Subtarget->hasSSE41())
    return SDValue();

  MVT VT = Op.getSimpleValueType();

  // Only AVX2 support 256-bit vector integer extending.
  if (!Subtarget->hasInt256() && VT.is256BitVector())
    return SDValue();

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  SDLoc DL(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = VT.getVectorNumElements();

  // Extending is an unary operation and the element type of the source vector
  // won't be equal to or larger than i64.
  if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
      VT.getVectorElementType() == MVT::i64)
    return SDValue();

  // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
  unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
  while ((1U << Shift) < NumElems) {
    // Detect the ratio from the position of mask element 1: a zext-by-2^Shift
    // shuffle places source element 1 at index 2^Shift.
    if (SVOp->getMaskElt(1U << Shift) == 1)
      break;
    Shift += 1;
    // The maximal ratio is 8, i.e. from i8 to i64.
    if (Shift > 3)
      return SDValue();
  }

  // Check the shuffle mask: every 2^Shift-th element must pick the next
  // consecutive source element, and all others must be undef (the zero bits).
  unsigned Mask = (1U << Shift) - 1;
  for (unsigned i = 0; i != NumElems; ++i) {
    int EltIdx = SVOp->getMaskElt(i);
    if ((i & Mask) != 0 && EltIdx != -1)
      return SDValue();
    if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
      return SDValue();
  }

  unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
  MVT NeVT = MVT::getIntegerVT(NBits);
  MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);

  if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
    return SDValue();

  // Simplify the operand as it's prepared to be fed into shuffle.
  unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
  if (V1.getOpcode() == ISD::BITCAST &&
      V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
      V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      V1.getOperand(0).getOperand(0)
        .getSimpleValueType().getSizeInBits() == SignificantBits) {
    // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
    SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
    ConstantSDNode *CIdx =
      dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
    // If it's foldable, i.e. normal load with single use, we will let code
    // selection to fold it. Otherwise, we will short the conversion sequence.
    if (CIdx && CIdx->getZExtValue() == 0 &&
        (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
      MVT FullVT = V.getSimpleValueType();
      MVT V1VT = V1.getSimpleValueType();
      if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) {
        // The "ext_vec_elt" node is wider than the result node.
        // In this case we should extract subvector from V.
        // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
        unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits();
        MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(),
                                        FullVT.getVectorNumElements()/Ratio);
        V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
                        DAG.getIntPtrConstant(0));
      }
      V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V);
    }
  }

  return DAG.getNode(ISD::BITCAST, DL, VT,
                     DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
}

/// NormalizeVectorShuffle - Canonicalize a generic shuffle before target
/// matching: fold zero shuffles, splats from foldable loads (vbroadcast),
/// integer-extending shuffles (pmovzx) and profitable narrowing rewrites.
/// Returns the replacement node, or SDValue() if nothing applied.
static SDValue
NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
                       SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);

  if (isZeroShuffle(SVOp))
    return getZeroVector(VT, Subtarget, DAG, dl);

  // Handle splat operations
  if (SVOp->isSplat()) {
    // Use vbroadcast whenever the splat comes from a foldable load
    SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
    if (Broadcast.getNode())
      return Broadcast;
  }

  // Check integer expanding shuffles.
  SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
  if (NewOp.getNode())
    return NewOp;

  // If the shuffle can be profitably rewritten as a narrower shuffle, then
  // do it!
  if (VT == MVT::v8i16 || VT == MVT::v16i8 ||
      VT == MVT::v16i16 || VT == MVT::v32i8) {
    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
    if (NewOp.getNode())
      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
  } else if ((VT == MVT::v4i32 ||
             (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
    // FIXME: Figure out a cleaner way to do this.
    // Try to make use of movq to zero out the top part.
    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
      if (NewOp.getNode()) {
        MVT NewVT = NewOp.getSimpleValueType();
        if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
                               NewVT, true, false))
          return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
                              DAG, Subtarget, dl);
      }
    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
      if (NewOp.getNode()) {
        MVT NewVT = NewOp.getSimpleValueType();
        if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
          return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
                              DAG, Subtarget, dl);
      }
    }
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  unsigned NumElems = VT.getVectorNumElements();
  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsSplat = false;
  bool V2IsSplat = false;
  bool HasSSE2 = Subtarget->hasSSE2();
  bool HasFp256 = Subtarget->hasFp256();
  bool HasInt256 = Subtarget->hasInt256();
  MachineFunction &MF = DAG.getMachineFunction();
  bool OptForSize = MF.getFunction()->getAttributes().
    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);

  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");

  if (V1IsUndef && V2IsUndef)
    return DAG.getUNDEF(VT);

  assert(!V1IsUndef && "Op 1 of shuffle should not be undef");

  // Vector shuffle lowering takes 3 steps:
  //
  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
  //    narrowing and commutation of operands should be handled.
  // 2) Matching of shuffles with known shuffle masks to x86 target specific
  //    shuffle nodes.
  // 3) Rewriting of unmatched masks into new generic shuffle operations,
  //    so the shuffle can be broken into other shuffles and the legalizer can
  //    try the lowering again.
  //
  // The general idea is that no vector_shuffle operation should be left to
  // be matched during isel, all of them must be converted to a target specific
  // node here.

  // Normalize the input vectors. Here splats, zeroed vectors, profitable
  // narrowing and commutation of operands should be handled. The actual code
  // doesn't include all of those, work in progress...
  SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
  if (NewOp.getNode())
    return NewOp;

  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());

  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
  // unpckh_undef). Only use pshufd if speed is more important than size.
  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);

  if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
      V2IsUndef && MayFoldVectorLoad(V1))
    return getMOVDDup(Op, dl, V1, DAG);

  if (isMOVHLPS_v_undef_Mask(M, VT))
    return getMOVHighToLow(Op, dl, DAG);

  // Use to match splats
  if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
      (VT == MVT::v2f64 || VT == MVT::v2i64))
    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);

  if (isPSHUFDMask(M, VT)) {
    // The actual implementation will match the mask in the if above and then
    // during isel it can match several different instructions, not only pshufd
    // as its name says, sad but true, emulate the behavior for now...
    if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);

    unsigned TargetMask = getShuffleSHUFImmediate(SVOp);

    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);

    if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
                                  DAG);

    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
                                TargetMask, DAG);
  }

  if (isPALIGNRMask(M, VT, Subtarget))
    return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
                                getShufflePALIGNRImmediate(SVOp),
                                DAG);

  // Check if this can be converted into a logical shift.
  bool isLeft = false;
  unsigned ShAmt = 0;
  SDValue ShVal;
  bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
  if (isShift && ShVal.hasOneUse()) {
    // If the shifted value has multiple uses, it may be cheaper to use
    // v_set0 + movlhps or movhlps, etc.
    MVT EltVT = VT.getVectorElementType();
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  if (isMOVLMask(M, VT)) {
    if (ISD::isBuildVectorAllZeros(V1.getNode()))
      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
    if (!isMOVLPMask(M, VT)) {
      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);

      if (VT == MVT::v4i32 || VT == MVT::v4f32)
        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
    }
  }

  // FIXME: fold these into legal mask.
  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);

  if (isMOVHLPSMask(M, VT))
    return getMOVHighToLow(Op, dl, DAG);

  if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);

  if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);

  if (isMOVLPMask(M, VT))
    return getMOVLP(Op, dl, DAG, HasSSE2);

  if (ShouldXformToMOVHLPS(M, VT) ||
      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
    return CommuteVectorShuffle(SVOp, DAG);

  if (isShift) {
    // No better options. Use a vshldq / vsrldq.
    MVT EltVT = VT.getVectorElementType();
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  bool Commuted = false;
  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
  // 1,1,1,1 -> v8i16 though.
  V1IsSplat = isSplatVector(V1.getNode());
  V2IsSplat = isSplatVector(V2.getNode());

  // Canonicalize the splat or undef, if present, to be on the RHS.
  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
    CommuteVectorShuffleMask(M, NumElems);
    std::swap(V1, V2);
    std::swap(V1IsSplat, V2IsSplat);
    Commuted = true;
  }

  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
    // Shuffling low element of v1 into undef, just return v1.
    if (V2IsUndef)
      return V1;
    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
    // the instruction selector will not match, so get a canonical MOVL with
    // swapped operands to undo the commute.
    return getMOVL(DAG, dl, VT, V2, V1);
  }

  if (isUNPCKLMask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);

  if (isUNPCKHMask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);

  if (V2IsSplat) {
    // Normalize mask so all entries that point to V2 points to its first
    // element then try to match unpck{h|l} again. If match, return a
    // new vector_shuffle with the corrected mask.
    SmallVector<int, 8> NewMask(M.begin(), M.end());
    NormalizeMask(NewMask, NumElems);
    if (isUNPCKLMask(NewMask, VT, HasInt256, true))
      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
    if (isUNPCKHMask(NewMask, VT, HasInt256, true))
      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
  }

  if (Commuted) {
    // Commute is back and try unpck* again.
    // FIXME: this seems wrong.
    CommuteVectorShuffleMask(M, NumElems);
    std::swap(V1, V2);
    std::swap(V1IsSplat, V2IsSplat);
    Commuted = false;

    if (isUNPCKLMask(M, VT, HasInt256))
      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);

    if (isUNPCKHMask(M, VT, HasInt256))
      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
  }

  // Normalize the node to match x86 shuffle ops if needed
  if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
    return CommuteVectorShuffle(SVOp, DAG);

  // The checks below are all present in isShuffleMaskLegal, but they are
  // inlined here right now to enable us to directly emit target specific
  // nodes, and remove one by one until they don't return Op anymore.

  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
      SVOp->getSplatIndex() == 0 && V2IsUndef) {
    if (VT == MVT::v2f64 || VT == MVT::v2i64)
      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
  }

  if (isPSHUFHWMask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
                                getShufflePSHUFHWImmediate(SVOp),
                                DAG);

  if (isPSHUFLWMask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
                                getShufflePSHUFLWImmediate(SVOp),
                                DAG);

  if (isSHUFPMask(M, VT))
    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
                                getShuffleSHUFImmediate(SVOp), DAG);

  if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
  if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);

  //===--------------------------------------------------------------------===//
  // Generate target specific nodes for 128 or 256-bit shuffles only
  // supported in the AVX instruction set.
  //

  // Handle VMOVDDUPY permutations
  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
    return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);

  // Handle VPERMILPS/D* permutations
  if (isVPERMILPMask(M, VT)) {
    if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
                                  getShuffleSHUFImmediate(SVOp), DAG);
    return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
                                getShuffleSHUFImmediate(SVOp), DAG);
  }

  // Handle VPERM2F128/VPERM2I128 permutations
  if (isVPERM2X128Mask(M, VT, HasFp256))
    return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
                                V2, getShuffleVPERM2X128Immediate(SVOp), DAG);

  SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
  if (BlendOp.getNode())
    return BlendOp;

  unsigned Imm8;
  if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);

  // Variable permutes (VPERMV/VPERMV3): build the index vector as a
  // BUILD_VECTOR of constants taken from the mask (undefs become 0).
  if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
      VT.is512BitVector()) {
    MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
    MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
    SmallVector<SDValue, 16> permclMask;
    for (unsigned i = 0; i != NumElems; ++i) {
      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
    }

    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT,
                               &permclMask[0], NumElems);
    if (V2IsUndef)
      // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
      return DAG.getNode(X86ISD::VPERMV, dl, VT,
                          DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
    return DAG.getNode(X86ISD::VPERMV3, dl, VT,
                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1, V2);
  }

  //===--------------------------------------------------------------------===//
  // Since no target specific shuffle was selected for this generic one,
  // lower it into other known shuffles. FIXME: this isn't true yet, but
  // this is the plan.
  //

  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
  if (VT == MVT::v8i16) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
    if (NewOp.getNode())
      return NewOp;
  }

  if (VT == MVT::v16i8) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
    if (NewOp.getNode())
      return NewOp;
  }

  if (VT == MVT::v32i8) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
    if (NewOp.getNode())
      return NewOp;
  }

  // Handle all 128-bit wide vectors with 4 elements, and match them with
  // several different shuffle types.
  if (NumElems == 4 && VT.is128BitVector())
    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);

  // Handle general 256-bit shuffles
  if (VT.is256BitVector())
    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);

  return SDValue();
}

/// LowerEXTRACT_VECTOR_ELT_SSE4 - Lower element extraction from a 128-bit
/// vector using SSE4.1 pextrb/pextrw/extractps/pextrq where profitable.
/// Returns SDValue() if no SSE4.1-specific lowering applies.
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
    return SDValue();

  if (VT.getSizeInBits() == 8) {
    // PEXTRB produces an i32; assert the zero-extension and truncate back.
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }

  if (VT.getSizeInBits() == 16) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BITCAST, dl,
                                                 MVT::v4i32,
                                                 Op.getOperand(0)),
                                     Op.getOperand(1)));
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }

  if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32. And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE ||
         (isa<ConstantSDNode>(Op.getOperand(1)) &&
          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
        (User->getOpcode() != ISD::BITCAST ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
                                              Op.getOperand(0)),
                                  Op.getOperand(1));
    return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
  }

  if (VT == MVT::i32 || VT == MVT::i64) {
    // ExtractPS/pextrq works with constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);
  if (!isa<ConstantSDNode>(Idx)) {
    // Variable index: for 512-bit vectors (and AVX2 256-bit vectors of
    // 32-bit elements) permute the wanted element into lane 0 and extract
    // from there.
    if (VecVT.is512BitVector() ||
        (VecVT.is256BitVector() && Subtarget->hasInt256() &&
         VecVT.getVectorElementType().getSizeInBits() == 32)) {

      MVT MaskEltVT =
        MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
      MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
                                    MaskEltVT.getSizeInBits());

      Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
      SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
                                 getZeroVector(MaskVT, Subtarget, DAG, dl),
                                 Idx, DAG.getConstant(0, getPointerTy()));
      SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
                         Perm, DAG.getConstant(0, getPointerTy()));
    }
    return SDValue();
  }

  // If this is a 256-bit vector result, first extract the 128-bit vector and
  // then extract the element from the 128-bit vector.
  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {

    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
    // Get the 128-bit vector.
    Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
    MVT EltVT = VecVT.getVectorElementType();

    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();

    //if (IdxVal >= NumElems/2)
    //  IdxVal -= NumElems/2;
    // Reduce the index to an offset within the extracted 128-bit chunk.
    IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                       DAG.getConstant(IdxVal, MVT::i32));
  }

  assert(VecVT.is128BitVector() && "Unexpected vector length");

  if (Subtarget->hasSSE41()) {
    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
    if (Res.getNode())
      return Res;
  }

  MVT VT = Op.getSimpleValueType();
  // TODO: handle v16i8.
  if (VT.getSizeInBits() == 16) {
    SDValue Vec = Op.getOperand(0);
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BITCAST, dl,
                                                 MVT::v4i32, Vec),
                                     Op.getOperand(1)));
    // Transform it so it matches pextrw which produces a 32-bit result.
    MVT EltVT = MVT::i32;
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }

  if (VT.getSizeInBits() == 32) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
    MVT VVT = Op.getOperand(0).getSimpleValueType();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                       DAG.getUNDEF(VVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  }

  if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    //        to match extract_elt for f64.
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    int Mask[2] = { 1, -1 };
    MVT VVT = Op.getOperand(0).getSimpleValueType();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                       DAG.getUNDEF(VVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  }

  return SDValue();
}

static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  SDLoc dl(Op);

  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  if (!VT.is128BitVector())
    return SDValue();

  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
      isa<ConstantSDNode>(N2)) {
    unsigned Opc;
    if (VT == MVT::v8i16)
      Opc = X86ISD::PINSRW;
    else if (VT == MVT::v16i8)
      Opc = X86ISD::PINSRB;
    else
      Opc = X86ISD::PINSRB;

    // Transform it so it match pinsr{b,w} which expects a GR32 as its second
    // argument.
7771 if (N1.getValueType() != MVT::i32) 7772 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 7773 if (N2.getValueType() != MVT::i32) 7774 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 7775 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 7776 } 7777 7778 if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 7779 // Bits [7:6] of the constant are the source select. This will always be 7780 // zero here. The DAG Combiner may combine an extract_elt index into these 7781 // bits. For example (insert (extract, 3), 2) could be matched by putting 7782 // the '3' into bits [7:6] of X86ISD::INSERTPS. 7783 // Bits [5:4] of the constant are the destination select. This is the 7784 // value of the incoming immediate. 7785 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 7786 // combine either bitwise AND or insert of float 0.0 to set these bits. 7787 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 7788 // Create this as a scalar to vector.. 7789 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 7790 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 7791 } 7792 7793 if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) { 7794 // PINSR* works with constant index. 7795 return Op; 7796 } 7797 return SDValue(); 7798} 7799 7800SDValue 7801X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 7802 MVT VT = Op.getSimpleValueType(); 7803 MVT EltVT = VT.getVectorElementType(); 7804 7805 SDLoc dl(Op); 7806 SDValue N0 = Op.getOperand(0); 7807 SDValue N1 = Op.getOperand(1); 7808 SDValue N2 = Op.getOperand(2); 7809 7810 // If this is a 256-bit vector result, first extract the 128-bit vector, 7811 // insert the element into the extracted half and then place it back. 7812 if (VT.is256BitVector() || VT.is512BitVector()) { 7813 if (!isa<ConstantSDNode>(N2)) 7814 return SDValue(); 7815 7816 // Get the desired 128-bit vector half. 
7817 unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); 7818 SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); 7819 7820 // Insert the element into the desired half. 7821 unsigned NumEltsIn128 = 128/EltVT.getSizeInBits(); 7822 unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128; 7823 7824 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, 7825 DAG.getConstant(IdxIn128, MVT::i32)); 7826 7827 // Insert the changed part back to the 256-bit vector 7828 return Insert128BitVector(N0, V, IdxVal, DAG, dl); 7829 } 7830 7831 if (Subtarget->hasSSE41()) 7832 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 7833 7834 if (EltVT == MVT::i8) 7835 return SDValue(); 7836 7837 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 7838 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 7839 // as its second argument. 7840 if (N1.getValueType() != MVT::i32) 7841 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 7842 if (N2.getValueType() != MVT::i32) 7843 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 7844 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 7845 } 7846 return SDValue(); 7847} 7848 7849static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { 7850 SDLoc dl(Op); 7851 MVT OpVT = Op.getSimpleValueType(); 7852 7853 // If this is a 256-bit vector result, first insert into a 128-bit 7854 // vector and then insert into the 256-bit vector. 7855 if (!OpVT.is128BitVector()) { 7856 // Insert into a 128-bit vector. 7857 unsigned SizeFactor = OpVT.getSizeInBits()/128; 7858 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), 7859 OpVT.getVectorNumElements() / SizeFactor); 7860 7861 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); 7862 7863 // Insert the 128-bit vector. 
7864 return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); 7865 } 7866 7867 if (OpVT == MVT::v1i64 && 7868 Op.getOperand(0).getValueType() == MVT::i64) 7869 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 7870 7871 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 7872 assert(OpVT.is128BitVector() && "Expected an SSE type!"); 7873 return DAG.getNode(ISD::BITCAST, dl, OpVT, 7874 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); 7875} 7876 7877// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in 7878// a simple subregister reference or explicit instructions to grab 7879// upper bits of a vector. 7880static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, 7881 SelectionDAG &DAG) { 7882 SDLoc dl(Op); 7883 SDValue In = Op.getOperand(0); 7884 SDValue Idx = Op.getOperand(1); 7885 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 7886 MVT ResVT = Op.getSimpleValueType(); 7887 MVT InVT = In.getSimpleValueType(); 7888 7889 if (Subtarget->hasFp256()) { 7890 if (ResVT.is128BitVector() && 7891 (InVT.is256BitVector() || InVT.is512BitVector()) && 7892 isa<ConstantSDNode>(Idx)) { 7893 return Extract128BitVector(In, IdxVal, DAG, dl); 7894 } 7895 if (ResVT.is256BitVector() && InVT.is512BitVector() && 7896 isa<ConstantSDNode>(Idx)) { 7897 return Extract256BitVector(In, IdxVal, DAG, dl); 7898 } 7899 } 7900 return SDValue(); 7901} 7902 7903// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a 7904// simple superregister reference or explicit instructions to insert 7905// the upper bits of a vector. 
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
                                     SelectionDAG &DAG) {
  if (Subtarget->hasFp256()) {
    SDLoc dl(Op.getNode());
    SDValue Vec = Op.getNode()->getOperand(0);
    SDValue SubVec = Op.getNode()->getOperand(1);
    SDValue Idx = Op.getNode()->getOperand(2);

    // 128-bit subvector into a 256/512-bit destination.
    if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
         Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
        SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
        isa<ConstantSDNode>(Idx)) {
      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
      return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
    }

    // 256-bit subvector into a 512-bit destination.
    if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
        SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
        isa<ConstantSDNode>(Idx)) {
      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
      return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
    }
  }
  return SDValue();
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
                                             CP->getAlignment(),
                                             CP->getOffset(), OpFlag);
  SDLoc DL(CP);
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     SDLoc(), getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
                                          OpFlag);
  SDLoc DL(JT);
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (OpFlag)
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     SDLoc(), getPointerTy()),
                         Result);

  return Result;
}

SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel)) {
    if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
      OpFlag = X86II::MO_GOTPCREL;
    WrapperKind = X86ISD::WrapperRIP;
  } else if (Subtarget->isPICStyleGOT()) {
    OpFlag = X86II::MO_GOT;
  } else if (Subtarget->isPICStyleStubPIC()) {
    OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
  } else if (Subtarget->isPICStyleStubNoDynamic()) {
    OpFlag = X86II::MO_DARWIN_NONLAZY;
  }

  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);

  SDLoc DL(Op);
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      !Subtarget->is64Bit()) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     SDLoc(), getPointerTy()),
                         Result);
  }

  // For symbols that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlag))
    Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(), false, false, false, 0);

  return Result;
}

SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  // Create the TargetBlockAddressAddress node.
  unsigned char OpFlags =
    Subtarget->ClassifyBlockAddressReference();
  CodeModel::Model M = getTargetMachine().getCodeModel();
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
  SDLoc dl(Op);
  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
                                             OpFlags);

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
  else
    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
                                      int64_t Offset, SelectionDAG &DAG) const {
  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
  unsigned char OpFlags =
    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
  CodeModel::Model M = getTargetMachine().getCodeModel();
  SDValue Result;
  if (OpFlags == X86II::MO_NO_FLAG &&
      X86::isOffsetSuitableForCodeModel(Offset, M)) {
    // A direct static reference to a global.
    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
    Offset = 0;
  } else {
    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
  }

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
  else
    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlags))
    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(), false, false, false, 0);

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
                         DAG.getConstant(Offset, getPointerTy()));

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
}

// Emit a TLSADDR (or TLSBASEADDR when LocalDynamic) pseudo call for GA and
// return a copy of the result register (ReturnReg) as the TLS address.
static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags, bool LocalDynamic = false) {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDLoc dl(GA);
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(),
                                           OperandFlags);

  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
                                           : X86ISD::TLSADDR;

  if (InFlag) {
    SDValue Ops[] = { Chain, TGA, *InFlag };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
  } else {
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
  }

  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
  MFI->setAdjustsStack(true);

  SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue InFlag;
  SDLoc dl(GA);  // ? function entry point might be better
  // The 32-bit general-dynamic sequence expects the GOT base in EBX.
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                   DAG.getNode(X86ISD::GlobalBaseReg,
                                               SDLoc(), PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "local dynamic" model: one
// TLSBASEADDR call per module plus an x@dtpoff addend per variable.
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG,
                                           const EVT PtrVT,
                                           bool is64Bit) {
  SDLoc dl(GA);

  // Get the start address of the TLS block for this module.
  X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
                                   .getInfo<X86MachineFunctionInfo>();
  MFI->incNumLocalDynamicTLSAccesses();

  SDValue Base;
  if (is64Bit) {
    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
  } else {
    SDValue InFlag;
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
    InFlag = Chain.getValue(1);
    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
  }

  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of Base.

  // Build x@dtpoff.
  unsigned char OperandFlags = X86II::MO_DTPOFF;
  unsigned WrapperKind = X86ISD::Wrapper;
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit, bool isPIC) {
  SDLoc dl(GA);

  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
                                                         is64Bit ? 257 : 256));

  SDValue ThreadPointer =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
                  MachinePointerInfo(Ptr), false, false, false, 0);

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initialexec.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (model == TLSModel::InitialExec) {
    if (is64Bit) {
      OperandFlags = X86II::MO_GOTTPOFF;
      WrapperKind = X86ISD::WrapperRIP;
    } else {
      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    }
  } else {
    llvm_unreachable("Unexpected model");
  }

  // emit "addl x@ntpoff,%eax" (local exec)
  // or   "addl x@indntpoff,%eax" (initial exec)
  // or   "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec) {
    // Initial-exec offsets live in the GOT: add the PIC base (32-bit PIC
    // only) and load the actual tp-relative offset.
    if (isPIC && !is64Bit) {
      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);
    }

    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         MachinePointerInfo::getGOT(), false, false, false, 0);
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GA->getGlobal();

  if (Subtarget->isTargetELF()) {
    // ELF: dispatch on the TLS model chosen for this global.
    TLSModel::Model model = getTargetMachine().getTLSModel(GV);

    switch (model) {
      case TLSModel::GeneralDynamic:
        if (Subtarget->is64Bit())
          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
      case TLSModel::LocalDynamic:
        return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
                                           Subtarget->is64Bit());
      case TLSModel::InitialExec:
      case TLSModel::LocalExec:
        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
                                   Subtarget->is64Bit(),
                        getTargetMachine().getRelocationModel() == Reloc::PIC_);
    }
    llvm_unreachable("Unknown TLS model.");
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin only has one model of TLS. Lower to that.
    unsigned char OpFlag = 0;
    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
                           X86ISD::WrapperRIP : X86ISD::Wrapper;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base reg.
    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
                 !Subtarget->is64Bit();
    if (PIC32)
      OpFlag = X86II::MO_TLVP_PIC_BASE;
    else
      OpFlag = X86II::MO_TLVP;
    SDLoc DL(Op);
    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                GA->getValueType(0),
                                                GA->getOffset(), OpFlag);
    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

    // With PIC32, the address is actually $g + Offset.
    if (PIC32)
      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                           DAG.getNode(X86ISD::GlobalBaseReg,
                                       SDLoc(), getPointerTy()),
                           Offset);

    // Lowering the machine isd will make sure everything is in the right
    // location.
    SDValue Chain = DAG.getEntryNode();
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Args[] = { Chain, Offset };
    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);

    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
    MFI->setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
                              Chain.getValue(1));
  }

  if (Subtarget->isTargetWindows() || Subtarget->isTargetMingw()) {
    // Just use the implicit TLS architecture
    // Need to generate something similar to:
    //   mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    //                                  ; from TEB
    //   mov ecx, dword [rel _tls_index]: Load index (from C runtime)
    //   mov rcx, qword [rdx+rcx*8]
    //   mov eax, .tls$:tlsvar
    //   [rax+rcx] contains the address
    // Windows 64bit: gs:0x58
    // Windows 32bit: fs:__tls_array

    // If GV is an alias then use the aliasee for determining
    // thread-localness.
    // NOTE: this GlobalAlias* GA shadows the outer GlobalAddressSDNode *GA
    // only inside this if; the SDLoc/getTargetGlobalAddress below still use
    // the outer node.
    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
      GV = GA->resolveAliasedGlobal(false);
    SDLoc dl(GA);
    SDValue Chain = DAG.getEntryNode();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
    // use its literal value of 0x2C.
    Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
                                        ? Type::getInt8PtrTy(*DAG.getContext(),
                                                             256)
                                        : Type::getInt32PtrTy(*DAG.getContext(),
                                                              257));

    SDValue TlsArray = Subtarget->is64Bit() ? DAG.getIntPtrConstant(0x58) :
      (Subtarget->isTargetMingw() ? DAG.getIntPtrConstant(0x2C) :
        DAG.getExternalSymbol("_tls_array", getPointerTy()));

    SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
                                        MachinePointerInfo(Ptr),
                                        false, false, false, 0);

    // Load the _tls_index variable
    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
    if (Subtarget->is64Bit())
      IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
                           IDX, MachinePointerInfo(), MVT::i32,
                           false, false, 0);
    else
      IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
                        false, false, false, 0);

    // Scale the index by the pointer size to index the TLS slot array.
    SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
                                    getPointerTy());
    IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);

    SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
                      false, false, false, 0);

    // Get the offset of start of .tls section
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
                                             GA->getOffset(), X86II::MO_SECREL);
    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);

    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}

/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
// Lower SHL_PARTS/SRA_PARTS/SRL_PARTS (double-register shifts).  The pair is
// expanded into a SHLD/SHRD plus a plain shift, with a CMOV selecting the
// correct halves when the shift amount is >= the part width.
SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  // Bits shifted in: sign of the high part for SRA, zero otherwise.
  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                     DAG.getConstant(VTBits - 1, MVT::i8))
                       : DAG.getConstant(0, VT);

  SDValue Tmp2, Tmp3;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  } else {
    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
  }

  // Test whether the shift amount is >= VTBits (the hardware shift only
  // uses the low bits of the amount).
  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                DAG.getConstant(VTBits, MVT::i8));
  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                             AndNode, DAG.getConstant(0, MVT::i8));

  SDValue Hi, Lo;
  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };

  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
  } else {
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
  }

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
}

// Lower scalar SINT_TO_FP.  Conversions the hardware handles directly in SSE
// registers are returned unchanged (really Legal); everything else is spilled
// to a stack slot and converted via an x87 FILD.
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  EVT SrcVT = Op.getOperand(0).getValueType();

  if (SrcVT.isVector())
    return SDValue();

  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return Op;
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      Subtarget->is64Bit()) {
    return Op;
  }

  SDLoc dl(Op);
  unsigned Size = SrcVT.getSizeInBits()/8;
  MachineFunction &MF = DAG.getMachineFunction();
  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                               StackSlot,
                               MachinePointerInfo::getFixedStack(SSFI),
                               false, false, 0);
  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}

// Build an x87 FILD that converts the integer stored at StackSlot.  When the
// result must live in an SSE register, FILD_FLAG is used and the x87 value is
// stored back to a fresh stack slot and reloaded (FST + load).
SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                     SDValue StackSlot,
                                     SelectionDAG &DAG) const {
  // Build the FILD
  SDLoc DL(Op);
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
  else
    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);

  unsigned ByteSize = SrcVT.getSizeInBits()/8;

  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
  MachineMemOperand *MMO;
  if (FI) {
    int SSFI = FI->getIndex();
    MMO =
      DAG.getMachineFunction()
      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                            MachineMemOperand::MOLoad, ByteSize, ByteSize);
  } else {
    // StackSlot is the chain result of a load here; reuse that load's memory
    // operand and take its pointer operand as the slot address.
    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
    StackSlot = StackSlot.getOperand(1);
  }
  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
                                           X86ISD::FILD, DL,
                                           Tys, Ops, array_lengthof(Ops),
                                           SrcVT, MMO);

  if (useSSE) {
    Chain = Result.getValue(1);
    SDValue InFlag = Result.getValue(2);

    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When stackifier is fixed, they can be uncoupled.
    MachineFunction &MF = DAG.getMachineFunction();
    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
    Tys = DAG.getVTList(MVT::Other);
    SDValue Ops[] = {
      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
    };
    MachineMemOperand *MMO =
      DAG.getMachineFunction()
      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                            MachineMemOperand::MOStore, SSFISize, SSFISize);

    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
                                    Ops, array_lengthof(Ops),
                                    Op.getValueType(), MMO);
    Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
                         MachinePointerInfo::getFixedStack(SSFI),
                         false, false, false, 0);
  }

  return Result;
}

// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
                                               SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here it is what we're trying to output:
  /*
     movq %rax, %xmm0
     punpckldq (c0), %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
     subpd (c1), %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
     #ifdef __SSE3__
       haddpd %xmm0, %xmm0
     #else
       pshufd $0x4e, %xmm0, %xmm1
       addpd %xmm1, %xmm0
     #endif
  */

  SDLoc dl(Op);
  LLVMContext *Context = DAG.getContext();

  // Build some magic constants.
  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
  Constant *C0 = ConstantDataVector::get(*Context, CV0);
  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);

  // c1: { 0x1.0p52, 0x1.0p52 * 0x1.0p32 } as raw double bit patterns.
  SmallVector<Constant*,2> CV1;
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
                                      APInt(64, 0x4330000000000000ULL))));
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
                                      APInt(64, 0x4530000000000000ULL))));
  Constant *C1 = ConstantVector::get(CV1);
  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);

  // Load the 64-bit value into an XMM register.
  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                            Op.getOperand(0));
  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, 16);
  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
                              CLod0);

  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, 16);
  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
  SDValue Result;

  if (Subtarget->hasSSE3()) {
    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
  } else {
    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
                                           S2F, 0x4E, DAG);
    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
                         Sub);
  }

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
                     DAG.getIntPtrConstant(0));
}

// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc dl(Op);
  // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
                                   MVT::f64);

  // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             Op.getOperand(0));

  // Zero out the upper parts of the register.
  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);

  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
                     DAG.getIntPtrConstant(0));

  // Or the load with the bias.
  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Load)),
                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Bias)));
  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
                   DAG.getIntPtrConstant(0));

  // Subtract the bias.
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  EVT DestVT = Op.getValueType();

  if (DestVT.bitsLT(MVT::f64))
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0));
  if (DestVT.bitsGT(MVT::f64))
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);

  // Handle final rounding.
  return Sub;
}

// Vector UINT_TO_FP for narrow unsigned elements: zero-extend to i32 lanes
// and use a signed conversion, which is exact for these value ranges.
SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  EVT SVT = N0.getValueType();
  SDLoc dl(Op);

  assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
          SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
         "Custom UINT_TO_FP is not supported!");

  EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                             SVT.getVectorNumElements());
  return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                     DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
}

// Top-level UINT_TO_FP lowering: dispatch to the vector, i64->f64 and
// i32 SSE expansions, or fall back to a stack-slot FILD expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);

  if (Op.getValueType().isVector())
    return lowerUINT_TO_FP_vec(Op, DAG);

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);

  EVT SrcVT = N0.getValueType();
  EVT DstVT = Op.getValueType();
  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i64(Op, DAG);
  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i32(Op, DAG);
  if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
    return SDValue();

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
  if (SrcVT == MVT::i32) {
    // Zero-extend the i32 into the 64-bit slot (store value, then zero the
    // high word) and convert it as a non-negative signed i64.
    SDValue WordOff = DAG.getConstant(4, getPointerTy());
    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
                                     getPointerTy(), StackSlot, WordOff);
    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                  StackSlot, MachinePointerInfo(),
                                  false, false, 0);
    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
                                  OffsetSlot, MachinePointerInfo(),
                                  false, false, 0);
    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
    return Fild;
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                               StackSlot, MachinePointerInfo(),
                               false, false, 0);
  // For i64 source, we need to add the appropriate power of 2 if the input
  // was negative. This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
  // we must be careful to do the computation in x87 extended precision, not
  // in SSE. (The generic code can't know it's OK to do this, or how to.)
  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
  MachineMemOperand *MMO =
    DAG.getMachineFunction()
    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                          MachineMemOperand::MOLoad, 8, 8);

  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
                                         array_lengthof(Ops), MVT::i64, MMO);

  // 0x5F800000 is 2^64 as a float; added when the input's sign bit was set.
  APInt FF(32, 0x5F800000ULL);

  // Check whether the sign bit is set.
  SDValue SignSet = DAG.getSetCC(dl,
                                 getSetCCResultType(*DAG.getContext(), MVT::i64),
                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
                                 ISD::SETLT);

  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
  SDValue FudgePtr = DAG.getConstantPool(
                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
                             getPointerTy());

  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
  SDValue Zero = DAG.getIntPtrConstant(0);
  SDValue Four = DAG.getIntPtrConstant(4);
  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
                               Zero, Four);
  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);

  // Load the value out, extending it from f32 to f80.
  // FIXME: Avoid the extend by constructing the right constant pool?
  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
                                 FudgePtr, MachinePointerInfo::getConstantPool(),
                                 MVT::f32, false, false, 4);
  // Extend everything to 80 bits to force it to be done on x87.
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
}

// Shared helper for FP_TO_SINT/FP_TO_UINT lowering.  Returns a (FIST chain,
// stack slot) pair to load the result from, a (value, null) pair for the
// Windows FTOL path, or (null, null) when the node is really Legal.
std::pair<SDValue,SDValue>
X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                   bool IsSigned, bool IsReplace) const {
  SDLoc DL(Op);

  EVT DstTy = Op.getValueType();

  if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
    // FP_TO_UINT of i32 is done as a wider signed i64 conversion.
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_INT to lower!");

  // These are really Legal.
  if (DstTy == MVT::i32 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());
  if (Subtarget->is64Bit() &&
      DstTy == MVT::i64 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());

  // We lower FP->int64 either into FISTP64 followed by a load from a temporary
  // stack slot, or into the FTOL runtime function.
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MemSize = DstTy.getSizeInBits()/8;
  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());

  unsigned Opc;
  if (!IsSigned && isIntegerTypeFTOL(DstTy))
    Opc = X86ISD::WIN_FTOL;
  else
    switch (DstTy.getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
    case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
    case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
    case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
    }

  SDValue Chain = DAG.getEntryNode();
  SDValue Value = Op.getOperand(0);
  EVT TheVT = Op.getOperand(0).getValueType();
  // FIXME This causes a redundant load/store if the SSE-class value is already
  // in memory, such as if it is on the callstack.
  if (isScalarFPTypeInSSEReg(TheVT)) {
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    // Spill the SSE value and reload it through an x87 FLD so the FIST can
    // consume it; a fresh slot is then allocated for the integer result.
    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
                         MachinePointerInfo::getFixedStack(SSFI),
                         false, false, 0);
    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
    SDValue Ops[] = {
      Chain, StackSlot, DAG.getValueType(TheVT)
    };

    MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                              MachineMemOperand::MOLoad, MemSize, MemSize);
    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops,
                                    array_lengthof(Ops), DstTy, MMO);
    Chain = Value.getValue(1);
    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  }

  MachineMemOperand *MMO =
    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                            MachineMemOperand::MOStore, MemSize, MemSize);

  if (Opc != X86ISD::WIN_FTOL) {
    // Build the FP_TO_INT*_IN_MEM
    SDValue Ops[] = { Chain, Value, StackSlot };
    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
                                           Ops, array_lengthof(Ops), DstTy,
                                           MMO);
    return std::make_pair(FIST, StackSlot);
  } else {
    // FTOL returns the i64 result in EDX:EAX; copy both halves out and
    // reassemble them.
    SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
                               DAG.getVTList(MVT::Other, MVT::Glue),
                               Chain, Value);
    SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
                                     MVT::i32, ftol.getValue(1));
    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
                                     MVT::i32, eax.getValue(2));
    SDValue Ops[] = { eax, edx };
    SDValue pair = IsReplace
      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops))
      : DAG.getMergeValues(Ops, array_lengthof(Ops), DL);
    return std::make_pair(pair, SDValue());
  }
}

// Lower a 128->256-bit vector zero/any-extend on AVX by unpacking the input
// with zero (or undef) and concatenating the two halves.
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
                              const X86Subtarget *Subtarget) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  // Optimize vectors in AVX mode:
  //
  //   v8i16 -> v8i32
  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
  //   Concat upper and lower parts.
  //
  //   v4i32 -> v4i64
  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
  //   Concat upper and lower parts.
  //

  if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
      ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
    return SDValue();

  if (Subtarget->hasInt256())
    return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, In);

  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
  SDValue Undef = DAG.getUNDEF(InVT);
  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ?
ZeroVec : Undef); 8903 8904 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(), 8905 VT.getVectorNumElements()/2); 8906 8907 OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); 8908 OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); 8909 8910 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 8911} 8912 8913static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, 8914 SelectionDAG &DAG) { 8915 MVT VT = Op->getValueType(0).getSimpleVT(); 8916 SDValue In = Op->getOperand(0); 8917 MVT InVT = In.getValueType().getSimpleVT(); 8918 SDLoc DL(Op); 8919 unsigned int NumElts = VT.getVectorNumElements(); 8920 if (NumElts != 8 && NumElts != 16) 8921 return SDValue(); 8922 8923 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) 8924 return DAG.getNode(X86ISD::VZEXT, DL, VT, In); 8925 8926 EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32; 8927 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 8928 // Now we have only mask extension 8929 assert(InVT.getVectorElementType() == MVT::i1); 8930 SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType()); 8931 const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue(); 8932 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); 8933 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 8934 SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP, 8935 MachinePointerInfo::getConstantPool(), 8936 false, false, false, Alignment); 8937 8938 SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld); 8939 if (VT.is512BitVector()) 8940 return Brcst; 8941 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst); 8942} 8943 8944static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget, 8945 SelectionDAG &DAG) { 8946 if (Subtarget->hasFp256()) { 8947 SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); 8948 if (Res.getNode()) 8949 return Res; 8950 } 8951 8952 return SDValue(); 8953} 8954 8955static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget 
*Subtarget, 8956 SelectionDAG &DAG) { 8957 SDLoc DL(Op); 8958 MVT VT = Op.getSimpleValueType(); 8959 SDValue In = Op.getOperand(0); 8960 MVT SVT = In.getSimpleValueType(); 8961 8962 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1) 8963 return LowerZERO_EXTEND_AVX512(Op, DAG); 8964 8965 if (Subtarget->hasFp256()) { 8966 SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); 8967 if (Res.getNode()) 8968 return Res; 8969 } 8970 8971 assert(!VT.is256BitVector() || !SVT.is128BitVector() || 8972 VT.getVectorNumElements() != SVT.getVectorNumElements()); 8973 return SDValue(); 8974} 8975 8976SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { 8977 SDLoc DL(Op); 8978 MVT VT = Op.getSimpleValueType(); 8979 SDValue In = Op.getOperand(0); 8980 MVT InVT = In.getSimpleValueType(); 8981 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && 8982 "Invalid TRUNCATE operation"); 8983 8984 if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) { 8985 if (VT.getVectorElementType().getSizeInBits() >=8) 8986 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); 8987 8988 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); 8989 unsigned NumElts = InVT.getVectorNumElements(); 8990 assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type"); 8991 if (InVT.getSizeInBits() < 512) { 8992 MVT ExtVT = (NumElts == 16)? 
MVT::v16i32 : MVT::v8i64; 8993 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); 8994 InVT = ExtVT; 8995 } 8996 SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType()); 8997 const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue(); 8998 SDValue CP = DAG.getConstantPool(C, getPointerTy()); 8999 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 9000 SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP, 9001 MachinePointerInfo::getConstantPool(), 9002 false, false, false, Alignment); 9003 SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld); 9004 SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In); 9005 return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); 9006 } 9007 9008 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { 9009 // On AVX2, v4i64 -> v4i32 becomes VPERMD. 9010 if (Subtarget->hasInt256()) { 9011 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; 9012 In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In); 9013 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32), 9014 ShufMask); 9015 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, 9016 DAG.getIntPtrConstant(0)); 9017 } 9018 9019 // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS. 
9020 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, 9021 DAG.getIntPtrConstant(0)); 9022 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, 9023 DAG.getIntPtrConstant(2)); 9024 9025 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo); 9026 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi); 9027 9028 // The PSHUFD mask: 9029 static const int ShufMask1[] = {0, 2, 0, 0}; 9030 SDValue Undef = DAG.getUNDEF(VT); 9031 OpLo = DAG.getVectorShuffle(VT, DL, OpLo, Undef, ShufMask1); 9032 OpHi = DAG.getVectorShuffle(VT, DL, OpHi, Undef, ShufMask1); 9033 9034 // The MOVLHPS mask: 9035 static const int ShufMask2[] = {0, 1, 4, 5}; 9036 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2); 9037 } 9038 9039 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { 9040 // On AVX2, v8i32 -> v8i16 becomed PSHUFB. 9041 if (Subtarget->hasInt256()) { 9042 In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In); 9043 9044 SmallVector<SDValue,32> pshufbMask; 9045 for (unsigned i = 0; i < 2; ++i) { 9046 pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8)); 9047 pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8)); 9048 pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8)); 9049 pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8)); 9050 pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8)); 9051 pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8)); 9052 pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8)); 9053 pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8)); 9054 for (unsigned j = 0; j < 8; ++j) 9055 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 9056 } 9057 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, 9058 &pshufbMask[0], 32); 9059 In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); 9060 In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In); 9061 9062 static const int ShufMask[] = {0, 2, -1, -1}; 9063 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64), 9064 &ShufMask[0]); 9065 In = 
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, 9066 DAG.getIntPtrConstant(0)); 9067 return DAG.getNode(ISD::BITCAST, DL, VT, In); 9068 } 9069 9070 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, 9071 DAG.getIntPtrConstant(0)); 9072 9073 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, 9074 DAG.getIntPtrConstant(4)); 9075 9076 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo); 9077 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi); 9078 9079 // The PSHUFB mask: 9080 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, 9081 -1, -1, -1, -1, -1, -1, -1, -1}; 9082 9083 SDValue Undef = DAG.getUNDEF(MVT::v16i8); 9084 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1); 9085 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1); 9086 9087 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo); 9088 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi); 9089 9090 // The MOVLHPS Mask: 9091 static const int ShufMask2[] = {0, 1, 4, 5}; 9092 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); 9093 return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res); 9094 } 9095 9096 // Handle truncation of V256 to V128 using shuffles. 
  if (!VT.is128BitVector() || !InVT.is256BitVector())
    return SDValue();

  assert(Subtarget->hasFp256() && "256-bit vector without AVX!");

  unsigned NumElems = VT.getVectorNumElements();
  EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
                             NumElems * 2);

  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
  // Prepare truncation shuffle mask
  for (unsigned i = 0; i != NumElems; ++i)
    MaskVec[i] = i * 2;
  SDValue V = DAG.getVectorShuffle(NVT, DL,
                                   DAG.getNode(ISD::BITCAST, DL, NVT, In),
                                   DAG.getUNDEF(NVT), &MaskVec[0]);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
                     DAG.getIntPtrConstant(0));
}

// Lower FP_TO_SINT via the FP_TO_INTHelper FIST/FTOL expansion (scalars) or
// a widened FP_TO_SINT plus TRUNCATE (v8i16 vectors).
SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
                                           SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  if (VT.isVector()) {
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT,
                         DAG.getNode(ISD::FP_TO_SINT, SDLoc(Op),
                                     MVT::v8i32, Op.getOperand(0)));
    return SDValue();
  }

  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
    /*IsSigned=*/ true, /*IsReplace=*/ false);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
  if (FIST.getNode() == 0) return Op;

  if (StackSlot.getNode())
    // Load the result.
    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
                       FIST, StackSlot, MachinePointerInfo(),
                       false, false, false, 0);

  // The node is the result.
  return FIST;
}

// Lower FP_TO_UINT via the same helper in unsigned mode.
SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
                                           SelectionDAG &DAG) const {
  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
    /*IsSigned=*/ false, /*IsReplace=*/ false);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  assert(FIST.getNode() && "Unexpected failure");

  if (StackSlot.getNode())
    // Load the result.
    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
                       FIST, StackSlot, MachinePointerInfo(),
                       false, false, false, 0);

  // The node is the result.
  return FIST;
}

// Lower v2f32 FP_EXTEND by widening to v4f32 (undef upper half) and using
// the VFPEXT target node.
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();

  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
                                 In, DAG.getUNDEF(SVT)));
}

// Lower FABS by AND'ing with a constant-pool mask that clears the sign bit.
SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT;
  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
  if (VT.isVector()) {
    EltVT = VT.getVectorElementType();
    NumElts = VT.getVectorNumElements();
  }
  Constant *C;
  if (EltVT == MVT::f64)
    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
                                          APInt(64, ~(1ULL << 63))));
  else
    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
                                          APInt(32, ~(1U << 31))));
  C = ConstantVector::getSplat(NumElts, C);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                             MachinePointerInfo::getConstantPool(),
                             false, false, false, Alignment);
  if (VT.isVector()) {
    MVT ANDVT = VT.is128BitVector() ?
                MVT::v2i64 : MVT::v4i64;
    return DAG.getNode(ISD::BITCAST, dl, VT,
                       DAG.getNode(ISD::AND, dl, ANDVT,
                                   DAG.getNode(ISD::BITCAST, dl, ANDVT,
                                               Op.getOperand(0)),
                                   DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
  }
  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
}

// Lower FNEG by XOR'ing with a constant-pool mask that flips the sign bit.
SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT;
  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
  if (VT.isVector()) {
    EltVT = VT.getVectorElementType();
    NumElts = VT.getVectorNumElements();
  }
  Constant *C;
  if (EltVT == MVT::f64)
    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
                                          APInt(64, 1ULL << 63)));
  else
    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
                                          APInt(32, 1U << 31)));
  C = ConstantVector::getSplat(NumElts, C);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                             MachinePointerInfo::getConstantPool(),
                             false, false, false, Alignment);
  if (VT.isVector()) {
    MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64);
    return DAG.getNode(ISD::BITCAST, dl, VT,
                       DAG.getNode(ISD::XOR, dl, XORVT,
                                   DAG.getNode(ISD::BITCAST, dl, XORVT,
                                               Op.getOperand(0)),
                                   DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
  }

  return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
}

// Lower FCOPYSIGN: combine the magnitude of Op0 with the sign of Op1 using
// constant-pool AND masks and an OR.
SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT SrcVT = Op1.getSimpleValueType();

  // If second operand is smaller, extend it first.
  if (SrcVT.bitsLT(VT)) {
    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
    SrcVT = VT;
  }
  // And if it is bigger, shrink it first.
  if (SrcVT.bitsGT(VT)) {
    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
    SrcVT = VT;
  }

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.

  // First get the sign bit of second operand.
  SmallVector<Constant*,4> CV;
  if (SrcVT == MVT::f64) {
    const fltSemantics &Sem = APFloat::IEEEdouble;
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63))));
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
  } else {
    const fltSemantics &Sem = APFloat::IEEEsingle;
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31))));
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
  }
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, 16);
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);

  // Shift sign bit right or left if the two operands have different types.
  if (SrcVT.bitsGT(VT)) {
    // Op0 is MVT::f32, Op1 is MVT::f64.
    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
                          DAG.getConstant(32, MVT::i32));
    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
                          DAG.getIntPtrConstant(0));
  }

  // Clear first operand sign bit.
  CV.clear();
  if (VT == MVT::f64) {
    const fltSemantics &Sem = APFloat::IEEEdouble;
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
                                                   APInt(64, ~(1ULL << 63)))));
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
  } else {
    const fltSemantics &Sem = APFloat::IEEEsingle;
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
                                                   APInt(32, ~(1U << 31)))));
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
  }
  C = ConstantVector::get(CV);
  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, 16);
  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);

  // Or the value with the sign bit.
  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
}

static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
                                  DAG.getConstant(1, VT));
  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
}

// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
// Returns the EFLAGS-producing X86ISD::PTEST node on success, or SDValue()
// if the OR'd tree does not match the all-zeros-test idiom.
static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
                                      SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

  // PTEST is an SSE4.1 instruction; without it there is nothing to form.
  if (!Subtarget->hasSSE41())
    return SDValue();

  if (!Op->hasOneUse())
    return SDValue();

  SDNode *N = Op.getNode();
  SDLoc DL(N);

  // Worklist of OR operands still to inspect, and a map from each source
  // vector to a bitmask of which of its elements have been seen extracted.
  SmallVector<SDValue, 8> Opnds;
  DenseMap<SDValue, unsigned> VecInMap;
  EVT VT = MVT::Other;

  // Recognize a special case where a vector is casted into wide integer to
  // test all 0s.
  Opnds.push_back(N->getOperand(0));
  Opnds.push_back(N->getOperand(1));

  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
    // BFS traverse all OR'd operands.
    if (I->getOpcode() == ISD::OR) {
      Opnds.push_back(I->getOperand(0));
      Opnds.push_back(I->getOperand(1));
      // Re-evaluate the number of nodes to be traversed.
      e += 2; // 2 more nodes (LHS and RHS) are pushed.
      continue;
    }

    // Quit if a non-EXTRACT_VECTOR_ELT leaf is found: every leaf of the OR
    // tree must be an element extracted from a vector.
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // Quit if without a constant index.
    SDValue Idx = I->getOperand(1);
    if (!isa<ConstantSDNode>(Idx))
      return SDValue();

    SDValue ExtractedFromVec = I->getOperand(0);
    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
    if (M == VecInMap.end()) {
      VT = ExtractedFromVec.getValueType();
      // Quit if not 128/256-bit vector.
      if (!VT.is128BitVector() && !VT.is256BitVector())
        return SDValue();
      // Quit if not the same type: all source vectors must agree so they can
      // be OR'd together below.
      if (VecInMap.begin() != VecInMap.end() &&
          VT != VecInMap.begin()->first.getValueType())
        return SDValue();
      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
    }
    // Record this element as covered for its source vector.
    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
  }

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Not extracted from 128-/256-bit vector.");

  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
  SmallVector<SDValue, 8> VecIns;

  for (DenseMap<SDValue, unsigned>::const_iterator
        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
    // Quit if not all elements are used: a partial OR is not an all-zeros
    // test of the whole vector.
    if (I->second != FullMask)
      return SDValue();
    VecIns.push_back(I->first);
  }

  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

  // Cast all vectors into TestVT for PTEST.
  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);

  // If more than one full vectors are evaluated, OR them first before PTEST.
  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
    // Each iteration will OR 2 nodes and append the result until there is only
    // 1 node left, i.e. the final OR'd value of all vectors.
    SDValue LHS = VecIns[Slot];
    SDValue RHS = VecIns[Slot + 1];
    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
  }

  // PTEST of a value against itself sets ZF iff the value is all zeros.
  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
                     VecIns.back(), VecIns.back());
}

/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent, producing an i32 EFLAGS result. \p X86CC is the condition the
/// caller intends to branch/select on; it decides whether the operand's own
/// flag output may be reused instead of emitting a separate TEST.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
                                    SelectionDAG &DAG) const {
  SDLoc dl(Op);

  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO:
    NeedOF = true;
    break;
  }

  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF)
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));

  unsigned Opcode = 0;
  unsigned NumOperands = 0;

  // Truncate operations may prevent the merge of the SETCC instruction
  // and the arithmetic instruction before it. Attempt to truncate the operands
  // of the arithmetic instruction and use a reduced bit-width instruction.
  bool NeedTruncation = false;
  SDValue ArithOp = Op;
  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
    SDValue Arith = Op->getOperand(0);
    // Both the trunc and the arithmetic op need to have one user each.
    if (Arith->hasOneUse())
      switch (Arith.getOpcode()) {
        default: break;
        case ISD::ADD:
        case ISD::SUB:
        case ISD::AND:
        case ISD::OR:
        case ISD::XOR: {
          NeedTruncation = true;
          ArithOp = Arith;
        }
      }
  }

  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
  // which may be the result of a CAST. We use the variable 'Op', which is the
  // non-casted variable when we check for possible users.
  switch (ArithOp.getOpcode()) {
  case ISD::ADD:
    // Due to an isel shortcoming, be conservative if this add is likely to be
    // selected as part of a load-modify-store instruction. When the root node
    // in a match is a store, isel doesn't know how to remap non-chain non-flag
    // uses of other nodes in the match, such as the ADD in this case. This
    // leads to the ADD being left around and reselected, with the result being
    // two adds in the output.  Alas, even if none our users are stores, that
    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
    // climbing the DAG back to the root, and it doesn't seem to be worth the
    // effort.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg &&
          UI->getOpcode() != ISD::SETCC &&
          UI->getOpcode() != ISD::STORE)
        goto default_case;

    if (ConstantSDNode *C =
        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
      // An add of one will be selected as an INC.
      if (C->getAPIntValue() == 1) {
        Opcode = X86ISD::INC;
        NumOperands = 1;
        break;
      }

      // An add of negative one (subtract of one) will be selected as a DEC.
      if (C->getAPIntValue().isAllOnesValue()) {
        Opcode = X86ISD::DEC;
        NumOperands = 1;
        break;
      }
    }

    // Otherwise use a regular EFLAGS-setting add.
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;
  case ISD::AND: {
    // If the primary and result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better.
    bool NonFlagUse = false;
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
      SDNode *User = *UI;
      unsigned UOpNo = UI.getOperandNo();
      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
        // Look pass truncate.
        UOpNo = User->use_begin().getOperandNo();
        User = *User->use_begin();
      }

      // Flag-only users are BRCOND, SETCC, and SELECT's condition operand
      // (operand 0); anything else consumes the AND's value.
      if (User->getOpcode() != ISD::BRCOND &&
          User->getOpcode() != ISD::SETCC &&
          !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
        NonFlagUse = true;
        break;
      }
    }

    if (!NonFlagUse)
      break;
  }
    // FALL THROUGH
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    // Due to the ISEL shortcoming noted above, be conservative if this op is
    // likely to be selected as part of a load-modify-store instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() == ISD::STORE)
        goto default_case;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (ArithOp.getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    case ISD::OR: {
      // An OR'd tree of vector extracts compared (in)equal to zero may be
      // expressible as a single PTEST.
      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
        SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
        if (EFLAGS.getNode())
          return EFLAGS;
      }
      Opcode = X86ISD::OR;
      break;
    }
    }

    NumOperands = 2;
    break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    // Already a target flag-setting node: result 1 is its EFLAGS output.
    return SDValue(Op.getNode(), 1);
  default:
  default_case:
    break;
  }

  // If we found that truncation is beneficial, perform the truncation and
  // update 'Op'.
  if (NeedTruncation) {
    EVT VT = Op.getValueType();
    SDValue WideVal = Op->getOperand(0);
    EVT WideVT = WideVal.getValueType();
    unsigned ConvertedOp = 0;
    // Use a target machine opcode to prevent further DAGCombine
    // optimizations that may separate the arithmetic operations
    // from the setcc node.
    switch (WideVal.getOpcode()) {
    default: break;
    case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
    case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
    case ISD::AND: ConvertedOp = X86ISD::AND; break;
    case ISD::OR: ConvertedOp = X86ISD::OR; break;
    case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
    }

    if (ConvertedOp) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
        // Narrow the operands and redo the arithmetic at the truncated width.
        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
      }
    }
  }

  if (Opcode == 0)
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));

  // Rebuild the operation as the chosen X86 flag-setting opcode and replace
  // all uses of the original node, then hand back its EFLAGS result (value 1).
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  SmallVector<SDValue, 4> Ops;
  for (unsigned i = 0; i != NumOperands; ++i)
    Ops.push_back(Op.getOperand(i));

  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
  DAG.ReplaceAllUsesWith(Op, New);
  return SDValue(New.getNode(), 1);
}

/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent, producing an i32 EFLAGS result.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
                                   SelectionDAG &DAG) const {
  // Comparison against zero is the TEST pattern; let EmitTest pick the best
  // flag-producing form.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
    if (C->getAPIntValue() == 0)
      return EmitTest(Op0, X86CC, DAG);

  SDLoc dl(Op0);
  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
    // Use SUB instead of CMP to enable CSE between SUB and CMP.
    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
                              Op0, Op1);
    // Value 1 of the SUB node is its EFLAGS output.
    return SDValue(Sub.getNode(), 1);
  }
  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
}

/// Convert a comparison if required by the subtarget.
SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
                                                 SelectionDAG &DAG) const {
  // If the subtarget does not support the FUCOMI instruction, floating-point
  // comparisons have to be converted.
  if (Subtarget->hasCMov() ||
      Cmp.getOpcode() != X86ISD::CMP ||
      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
      !Cmp.getOperand(1).getValueType().isFloatingPoint())
    return Cmp;

  // The instruction selector will select an FUCOM instruction instead of
  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
  SDLoc dl(Cmp);
  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
  // The condition bits live in the high byte of FPSW; shift them down so SAHF
  // can load them into EFLAGS.
  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
                            DAG.getConstant(8, MVT::i8));
  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}

// Returns true iff V is a constant all-ones value.
static bool isAllOnes(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  return C && C->isAllOnesValue();
}

/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
/// if it's possible.
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
                                     SDLoc dl, SelectionDAG &DAG) const {
  SDValue Op0 = And.getOperand(0);
  SDValue Op1 = And.getOperand(1);
  // Look through truncates on either operand of the AND.
  if (Op0.getOpcode() == ISD::TRUNCATE)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  SDValue LHS, RHS;
  // Canonicalize a SHL operand into Op0.
  if (Op1.getOpcode() == ISD::SHL)
    std::swap(Op0, Op1);
  if (Op0.getOpcode() == ISD::SHL) {
    // Match (X & (1 << N)): the shifted value must be the constant 1.
    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
      if (And00C->getZExtValue() == 1) {
        // If we looked past a truncate, check that it's only truncating away
        // known zeros.
        unsigned BitWidth = Op0.getValueSizeInBits();
        unsigned AndBitWidth = And.getValueSizeInBits();
        if (BitWidth > AndBitWidth) {
          APInt Zeros, Ones;
          DAG.ComputeMaskedBits(Op0, Zeros, Ones);
          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
            return SDValue();
        }
        LHS = Op1;
        RHS = Op0.getOperand(1);
      }
  } else if (Op1.getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
    uint64_t AndRHSVal = AndRHS->getZExtValue();
    SDValue AndLHS = Op0;

    // Match ((X >> N) & 1): test bit N of X.
    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
      LHS = AndLHS.getOperand(0);
      RHS = AndLHS.getOperand(1);
    }

    // Use BT if the immediate can't be encoded in a TEST instruction.
    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
      LHS = AndLHS;
      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
    }
  }

  if (LHS.getNode()) {
    // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
    // instruction.  Since the shift amount is in-range-or-undefined, we know
    // that doing a bittest on the i32 value is ok.  We extend to i32 because
    // the encoding for the i16 version is larger than the i32 version.
    // Also promote i16 to i32 for performance / code size reason.
    if (LHS.getValueType() == MVT::i8 ||
        LHS.getValueType() == MVT::i16)
      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);

    // If the operand types disagree, extend the shift amount to match.  Since
    // BT ignores high bits (like shifts) we can use anyextend.
    if (LHS.getValueType() != RHS.getValueType())
      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);

    // BT sets CF to the tested bit; equal-to-zero maps to AE (CF==0), not
    // equal maps to B (CF==1).
    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
    X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                       DAG.getConstant(Cond, MVT::i8), BT);
  }

  return SDValue();
}

/// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
/// mask CMPs.  May swap \p Op0 and \p Op1 to express the condition with the
/// available predicates; returns 8 for the two conditions (SETUEQ/SETONE)
/// that need a two-compare expansion.
static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
                              SDValue &Op1) {
  unsigned SSECC;
  bool Swap = false;

  // SSE Condition code mapping:
  //  0 - EQ
  //  1 - LT
  //  2 - LE
  //  3 - UNORD
  //  4 - NEQ
  //  5 - NLT
  //  6 - NLE
  //  7 - ORD
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETOEQ:
  case ISD::SETEQ:  SSECC = 0; break;
  case ISD::SETOGT:
  case ISD::SETGT: Swap = true; // Fallthrough
  case ISD::SETLT:
  case ISD::SETOLT: SSECC = 1; break;
  case ISD::SETOGE:
  case ISD::SETGE: Swap = true; // Fallthrough
  case ISD::SETLE:
  case ISD::SETOLE: SSECC = 2; break;
  case ISD::SETUO:  SSECC = 3; break;
  case ISD::SETUNE:
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETULE: Swap = true; // Fallthrough
  case ISD::SETUGE: SSECC = 5; break;
  case ISD::SETULT: Swap = true; // Fallthrough
  case ISD::SETUGT: SSECC = 6; break;
  case ISD::SETO:   SSECC = 7; break;
  case ISD::SETUEQ:
  case ISD::SETONE: SSECC = 8; break; // needs two compares; see callers
  }
  if (Swap)
    std::swap(Op0, Op1);

  return SSECC;
}

// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
// ones, and then concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);
  SDValue CC = Op.getOperand(2);

  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);

  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);

  // Issue the operation on the smaller types and concatenate the result back
  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}

// Lower an AVX-512 integer SETCC producing an i1 mask vector via the
// CMPM/CMPMU (signed/unsigned compare-to-mask) nodes.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();

  assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 &&
         Op.getValueType().getScalarType() == MVT::i1 &&
         "Cannot set masked compare for this operation");

  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  SDLoc dl(Op);

  bool Unsigned = false;
  unsigned SSECC;
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETEQ:  SSECC = 0; break;
  case ISD::SETUGT: Unsigned = true; // fall through to share the predicate
  case ISD::SETGT:  SSECC = 6; break; // NLE
  case ISD::SETULT: Unsigned = true; // fall through
  case ISD::SETLT:  SSECC = 1; break;
  case ISD::SETUGE: Unsigned = true; // fall through
  case ISD::SETGE:  SSECC = 5; break; // NLT
  case ISD::SETULE: Unsigned = true; // fall through
  case ISD::SETLE:  SSECC = 2; break;
  }
  unsigned Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
  return DAG.getNode(Opc, dl, VT, Op0, Op1,
                     DAG.getConstant(SSECC, MVT::i8));

}

static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
                           SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
  SDLoc dl(Op);

  if (isFP) {
#ifndef NDEBUG
    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif

    unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
    unsigned Opc = X86ISD::CMPP;
    if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
      assert(VT.getVectorNumElements() <= 16);
      Opc = X86ISD::CMPM;
    }
    // In the two special cases we can't handle, emit two comparisons.
    if (SSECC == 8) {
      unsigned CC0, CC1;
      unsigned CombineOpc;
      if (SetCCOpcode == ISD::SETUEQ) {
        // UEQ == UNORD || EQ
        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
      } else {
        assert(SetCCOpcode == ISD::SETONE);
        // ONE == ORD && NEQ
        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
      }

      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC0, MVT::i8));
      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC1, MVT::i8));
      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
    }
    // Handle all other FP comparisons here.
    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                       DAG.getConstant(SSECC, MVT::i8));
  }

  // Break 256-bit integer vector compare into smaller ones.
  if (VT.is256BitVector() && !Subtarget->hasInt256())
    return Lower256IntVSETCC(Op, DAG);

  bool MaskResult = (VT.getVectorElementType() == MVT::i1);
  EVT OpVT = Op1.getValueType();
  if (Subtarget->hasAVX512()) {
    if (Op1.getValueType().is512BitVector() ||
        (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
      return LowerIntVSETCC_AVX512(Op, DAG);

    // In AVX-512 architecture setcc returns mask with i1 elements,
    // But there is no compare instruction for i8 and i16 elements.
    // We are not talking about 512-bit operands in this case, these
    // types are illegal.
    if (MaskResult &&
        (OpVT.getVectorElementType().getSizeInBits() < 32 &&
         OpVT.getVectorElementType().getSizeInBits() >= 8))
      return DAG.getNode(ISD::TRUNCATE, dl, VT,
                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
  }

  // We are handling one of the integer comparisons here.  Since SSE only has
  // GT and EQ comparisons for integer, swapping operands and multiple
  // operations may be required for some comparisons.
  unsigned Opc;
  bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;

  // Intentional fallthroughs below: NE reuses EQ (inverted), LT reuses GT
  // (swapped), and the unsigned forms reuse signed GT after flipping signs.
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  Invert = true;
  case ISD::SETEQ:  Opc = MaskResult? X86ISD::PCMPEQM: X86ISD::PCMPEQ; break;
  case ISD::SETLT:  Swap = true;
  case ISD::SETGT:  Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; break;
  case ISD::SETGE:  Swap = true;
  case ISD::SETLE:  Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
                    Invert = true; break;
  case ISD::SETULT: Swap = true;
  case ISD::SETUGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
                    FlipSigns = true; break;
  case ISD::SETUGE: Swap = true;
  case ISD::SETULE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
                    FlipSigns = true; Invert = true; break;
  }

  // Special case: Use min/max operations for SETULE/SETUGE
  MVT VET = VT.getVectorElementType();
  bool hasMinMax =
       (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
    || (Subtarget->hasSSE2()  && (VET == MVT::i8));

  if (hasMinMax) {
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
    case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
    }

    // min/max expresses the unsigned compare directly; undo the workarounds
    // chosen above.
    if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
  }

  if (Swap)
    std::swap(Op0, Op1);

  // Check that the operation in question is available (most are plain SSE2,
  // but PCMPGTQ and PCMPEQQ have different requirements).
  if (VT == MVT::v2i64) {
    if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
      assert(Subtarget->hasSSE2() && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);

      // Since SSE has no unsigned integer comparisons, we need to flip the sign
      // bits of the inputs before performing those operations. The lower
      // compare is always unsigned.
      SDValue SB;
      if (FlipSigns) {
        SB = DAG.getConstant(0x80000000U, MVT::v4i32);
      } else {
        SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
        SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
        SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                         Sign, Zero, Sign, Zero);
      }
      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);

      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

      // Create masks for only the low parts/high parts of the 64 bit integers.
      static const int MaskHi[] = { 1, 1, 3, 3 };
      static const int MaskLo[] = { 0, 0, 2, 2 };
      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
    }

    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
      // pcmpeqd + pshufd + pand.
      assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);

      // Do the compare.
      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

      // Make sure the lower and upper halves are both all-ones.
      static const int Mask[] = { 1, 0, 3, 2 };
      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
    }
  }

  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    EVT EltVT = VT.getVectorElementType();
    SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  // For the min/max path: x <=u y  iff  umin(x,y) == x (and dually for umax).
  if (MinMax)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

  return Result;
}

SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

  MVT VT = Op.getSimpleValueType();

  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);

  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDLoc dl(Op);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // Optimize to BT if possible.
  // Lower (X & (1 << N)) == 0 to BT(X, N).
  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
      Op1.getOpcode() == ISD::Constant &&
      cast<ConstantSDNode>(Op1)->isNullValue() &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
    if (NewSetCC.getNode())
      return NewSetCC;
  }

  // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
  // these.
  if (Op1.getOpcode() == ISD::Constant &&
      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
       cast<ConstantSDNode>(Op1)->isNullValue()) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {

    // If the input is a setcc, then reuse the input setcc or use a new one with
    // the inverted condition.
    if (Op0.getOpcode() == X86ISD::SETCC) {
      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
      // Invert iff exactly one of (comparison is NE, RHS is 0) holds:
      // (setcc == 1) and (setcc != 0) keep the condition as-is.
      bool Invert = (CC == ISD::SETNE) ^
        cast<ConstantSDNode>(Op1)->isNullValue();
      if (!Invert) return Op0;

      CCode = X86::GetOppositeBranchCondition(CCode);
      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                         DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
    }
  }

  bool isFP = Op1.getSimpleValueType().isFloatingPoint();
  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
  if (X86CC == X86::COND_INVALID)
    return SDValue();

  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                     DAG.getConstant(X86CC, MVT::i8), EFLAGS);
}

// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
10122static bool isX86LogicalCmp(SDValue Op) { 10123 unsigned Opc = Op.getNode()->getOpcode(); 10124 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || 10125 Opc == X86ISD::SAHF) 10126 return true; 10127 if (Op.getResNo() == 1 && 10128 (Opc == X86ISD::ADD || 10129 Opc == X86ISD::SUB || 10130 Opc == X86ISD::ADC || 10131 Opc == X86ISD::SBB || 10132 Opc == X86ISD::SMUL || 10133 Opc == X86ISD::UMUL || 10134 Opc == X86ISD::INC || 10135 Opc == X86ISD::DEC || 10136 Opc == X86ISD::OR || 10137 Opc == X86ISD::XOR || 10138 Opc == X86ISD::AND)) 10139 return true; 10140 10141 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 10142 return true; 10143 10144 return false; 10145} 10146 10147static bool isZero(SDValue V) { 10148 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 10149 return C && C->isNullValue(); 10150} 10151 10152static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { 10153 if (V.getOpcode() != ISD::TRUNCATE) 10154 return false; 10155 10156 SDValue VOp0 = V.getOperand(0); 10157 unsigned InBits = VOp0.getValueSizeInBits(); 10158 unsigned Bits = V.getValueSizeInBits(); 10159 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); 10160} 10161 10162SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 10163 bool addTest = true; 10164 SDValue Cond = Op.getOperand(0); 10165 SDValue Op1 = Op.getOperand(1); 10166 SDValue Op2 = Op.getOperand(2); 10167 SDLoc DL(Op); 10168 EVT VT = Op1.getValueType(); 10169 SDValue CC; 10170 10171 // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops 10172 // are available. Otherwise fp cmovs get lowered into a less efficient branch 10173 // sequence later on. 
  if (Cond.getOpcode() == ISD::SETCC &&
      ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
       (Subtarget->hasSSE1() && VT == MVT::f32)) &&
      VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
    int SSECC = translateX86FSETCC(
        cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);

    // translateX86FSETCC returns 8 when the FP condition has no single SSE
    // cmp immediate encoding — presumably; fall through to the generic path
    // in that case.
    if (SSECC != 8) {
      unsigned Opcode = VT == MVT::f32 ? X86ISD::FSETCCss : X86ISD::FSETCCsd;
      SDValue Cmp = DAG.getNode(Opcode, DL, VT, CondOp0, CondOp1,
                                DAG.getConstant(SSECC, MVT::i8));
      // (Cmp & Op1) | (~Cmp & Op2): the all-ones/all-zeros compare result
      // acts as a select mask.
      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
    }
  }

  // Lower a generic SETCC condition into X86ISD::SETCC form first.
  if (Cond.getOpcode() == ISD::SETCC) {
    SDValue NewCond = LowerSETCC(Cond, DAG);
    if (NewCond.getNode())
      Cond = NewCond;
  }

  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
  if (Cond.getOpcode() == X86ISD::SETCC &&
      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
      isZero(Cond.getOperand(1).getOperand(1))) {
    SDValue Cmp = Cond.getOperand(1);

    unsigned CondCode = cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

    if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
      // Y is whichever select arm is not the all-ones constant.
      SDValue Y = isAllOnes(Op2) ? Op1 : Op2;

      SDValue CmpOp0 = Cmp.getOperand(0);
      // Apply further optimizations for special cases
      // (select (x != 0), -1, 0) -> neg & sbb
      // (select (x == 0), 0, -1) -> neg & sbb
      if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
        if (YC->isNullValue() &&
            (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
          // NEG sets CF iff x != 0; SETCC_CARRY then materializes 0/-1.
          SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
          SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
                                    DAG.getConstant(0, CmpOp0.getValueType()),
                                    CmpOp0);
          SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                    DAG.getConstant(X86::COND_B, MVT::i8),
                                    SDValue(Neg.getNode(), 1));
          return Res;
        }

      // cmp x, 1: CF is set iff x == 0 (unsigned borrow from x - 1).
      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                        CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
      Cmp = ConvertCmpIfNecessary(Cmp, DAG);

      SDValue Res =   // Res = 0 or -1.
        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                    DAG.getConstant(X86::COND_B, MVT::i8), Cmp);

      // Invert the mask when the all-ones arm is on the opposite side.
      if (isAllOnes(Op1) != (CondCode == X86::COND_E))
        Res = DAG.getNOT(DL, Res, Res.getValueType());

      // OR in Y unless it is a constant zero (OR with 0 is a no-op).
      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
      if (N2C == 0 || !N2C->isNullValue())
        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
      return Res;
    }
  }

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    if (C && C->getAPIntValue() == 1)
      Cond = Cond.getOperand(0);
  }

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    MVT VT = Op.getSimpleValueType();

    // Scalar FP values living on the x87 stack (not in SSE registers) only
    // support a limited set of FCMOV condition codes.
    bool IllegalFPCMov = false;
    if (VT.isFloatingPoint() && !VT.isVector() &&
        !isScalarFPTypeInSSEReg(VT))  // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      // Reuse the flag-producing node directly; no extra TEST needed.
      Cond = Cmp;
      addTest = false;
    }
  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
              Cond.getOperand(0).getValueType() != MVT::i8)) {
    // Overflow-checking arithmetic: lower to the X86 node that produces
    // both the value and EFLAGS, and select on the appropriate flag.
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
    switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
    // X86ISD::UMUL has an extra result (lo, hi, flags); others are
    // (value, flags).
    if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);

    if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, MVT::i8);
    addTest = false;
  }

  if (addTest) {
    // Look pass the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
      if (NewSetCC.getNode()) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    // Fall back to an explicit TEST against zero with a NE condition.
    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DAG);
  }

  // a < b ? -1 : 0 -> RES = ~setcc_carry
  // a < b ? 0 : -1 -> RES = setcc_carry
  // a >= b ? -1 : 0 -> RES = setcc_carry
  // a >= b ? 0 : -1 -> RES = ~setcc_carry
  if (Cond.getOpcode() == X86ISD::SUB) {
    Cond = ConvertCmpIfNecessary(Cond, DAG);
    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
        (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
      // Materialize 0/-1 directly from the carry flag (SBB idiom).
      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                DAG.getConstant(X86::COND_B, MVT::i8), Cond);
      if (isAllOnes(Op1) != (CondCode == X86::COND_B))
        return DAG.getNOT(DL, Res, Res.getValueType());
      return Res;
    }
  }

  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
  // widen the cmov and push the truncate through. This avoids introducing a new
  // branch during isel and doesn't add any extensions.
  if (Op.getValueType() == MVT::i8 &&
      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
    if (T1.getValueType() == T2.getValueType() &&
        // Blacklist CopyFromReg to avoid partial register stalls.
        T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
      // Emit the CMOV at the wider pre-truncate type, then truncate its
      // result back down to i8.
      SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
    }
  }

  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
  SDValue Ops[] = { Op2, Op1, CC, Cond };
  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
}

// LowerSIGN_EXTEND_AVX512 - Lower sign extension to 512-bit vectors (or from
// i1 mask vectors) using AVX-512 nodes. Returns SDValue() for element counts
// other than 8 or 16, leaving lowering to other code paths.
static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  unsigned int NumElts = VT.getVectorNumElements();
  if (NumElts != 8 && NumElts != 16)
    return SDValue();

  // Non-mask input to a 512-bit result: a single VSEXT suffices.
  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");

  // i1 mask input: broadcast an all-ones scalar under the mask
  // (VBROADCASTM), yielding -1 in selected lanes and 0 elsewhere.
  MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
  Constant *C = ConstantInt::get(*DAG.getContext(),
    APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));

  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
  SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
                           MachinePointerInfo::getConstantPool(),
                           false, false, false, Alignment);
  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
  if (VT.is512BitVector())
    return Brcst;
  // Result narrower than the broadcast type: truncate down to VT.
  return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
}

// LowerSIGN_EXTEND - Lower vector sign extension for the AVX/AVX2/AVX-512
// cases handled here; returns SDValue() for unsupported type combinations.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
    return LowerSIGN_EXTEND_AVX512(Op, DAG);

  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
      (VT != MVT::v16i16 || InVT != MVT::v16i8))
    return SDValue();

  // With AVX2 a single 256-bit sign-extend instruction handles it.
  if (Subtarget->hasInt256())
    return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, In);

  // Optimize vectors in AVX mode
  // Sign extend v8i16 to v8i32 and
  // v4i32 to v4i64
  //
  // Divide input vector into two parts
  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
  // concat the vectors to original VT

  unsigned NumElems = InVT.getVectorNumElements();
  SDValue Undef = DAG.getUNDEF(InVT);

  // Low half: keep elements [0, NumElems/2), leave the rest undef.
  SmallVector<int,8> ShufMask1(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask1[i] = i;

  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);

  // High half: move elements [NumElems/2, NumElems) down to the low lanes.
  SmallVector<int,8> ShufMask2(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask2[i] = i + NumElems/2;

  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);

  MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
                                VT.getVectorNumElements()/2);

  // Sign-extend each half separately, then stitch them back together.
  OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
  OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}

// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
// from the AND / OR. On success Opc is set to the AND/OR opcode.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
  Opc = Op.getOpcode();
  if (Opc != ISD::OR && Opc != ISD::AND)
    return false;
  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(0).hasOneUse() &&
          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(1).hasOneUse());
}

// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
// 1 and that the SETCC node has a single use.
static bool isXor1OfSetCC(SDValue Op) {
  if (Op.getOpcode() != ISD::XOR)
    return false;
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (N1C && N1C->getAPIntValue() == 1) {
    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
           Op.getOperand(0).hasOneUse();
  }
  return false;
}

// LowerBRCOND - Custom-lower ISD::BRCOND. Operand 0 is the chain, operand 1
// the condition, operand 2 the destination basic block.
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  // addTest records whether an explicit flag-setting test must still be
  // emitted; Inverted flips the branch condition for the setcc(x, 0, eq)
  // pattern recognized below.
  bool addTest = true;
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);
  SDLoc dl(Op);
  SDValue CC;
  bool Inverted = false;

  if (Cond.getOpcode() == ISD::SETCC) {
    // Check for setcc([su]{add,sub,mul}o == 0).
    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
        isa<ConstantSDNode>(Cond.getOperand(1)) &&
        cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
        Cond.getOperand(0).getResNo() == 1 &&
        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
      // Branch on the overflow bit itself, with the condition inverted.
      Inverted = true;
      Cond = Cond.getOperand(0);
    } else {
      SDValue NewCond = LowerSETCC(Cond, DAG);
      if (NewCond.getNode())
        Cond = NewCond;
    }
  }
#if 0
  // FIXME: LowerXALUO doesn't handle these!!
  else if (Cond.getOpcode() == X86ISD::ADD ||
           Cond.getOpcode() == X86ISD::SUB ||
           Cond.getOpcode() == X86ISD::SMUL ||
           Cond.getOpcode() == X86ISD::UMUL)
    Cond = LowerXALUO(Cond, DAG);
#endif

  // Look pass (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    if (C && C->getAPIntValue() == 1)
      Cond = Cond.getOperand(0);
  }

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
      // Reuse the flag-producing node directly; no extra TEST needed.
      Cond = Cmp;
      addTest = false;
    } else {
      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
      default: break;
      case X86::COND_O:
      case X86::COND_B:
        // These can only come from an arithmetic instruction with overflow,
        // e.g. SADDO, UADDO.
        Cond = Cond.getNode()->getOperand(1);
        addTest = false;
        break;
      }
    }
  }
  CondOpcode = Cond.getOpcode();
  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
       Cond.getOperand(0).getValueType() != MVT::i8)) {
    // Overflow-checking arithmetic: lower to the X86 node that produces
    // both the value and EFLAGS, and branch on the appropriate flag.
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
    switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
    // The setcc(...o == 0) pattern above branches on "no overflow".
    if (Inverted)
      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
    // X86ISD::UMUL has an extra result (lo, hi, flags).
    if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);

    if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, MVT::i8);
    addTest = false;
  } else {
    unsigned CondOpc;
    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
      SDValue Cmp = Cond.getOperand(0).getOperand(1);
      if (CondOpc == ISD::OR) {
        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
        // two branches instead of an explicit OR instruction with a
        // separate test.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp)) {
          // Emit the first branch here; the second is emitted by the common
          // code at the bottom using the updated CC/Cond.
          CC = Cond.getOperand(0).getOperand(0);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = Cond.getOperand(1).getOperand(0);
          Cond = Cmp;
          addTest = false;
        }
      } else { // ISD::AND
        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
        // two branches instead of an explicit AND instruction with a
        // separate test. However, we only do this if this block doesn't
        // have a fall-through edge, because this requires an explicit
        // jmp when the condition is false.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp) &&
            Op.getNode()->hasOneUse()) {
          X86::CondCode CCode =
            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
          CCode = X86::GetOppositeBranchCondition(CCode);
          CC = DAG.getConstant(CCode, MVT::i8);
          SDNode *User = *Op.getNode()->use_begin();
          // Look for an unconditional branch following this conditional branch.
          // We need this because we need to reverse the successors in order
          // to implement FCMP_OEQ.
          if (User->getOpcode() == ISD::BR) {
            // Retarget the unconditional BR at our Dest and branch to its
            // old target (FalseBB) instead — i.e. swap the successors.
            SDValue FalseBB = User->getOperand(1);
            SDNode *NewBR =
              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
            assert(NewBR == User);
            (void)NewBR;
            Dest = FalseBB;

            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                                Chain, Dest, CC, Cmp);
            // Second branch uses the inverse of the second SETCC's condition.
            X86::CondCode CCode =
              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
            CCode = X86::GetOppositeBranchCondition(CCode);
            CC = DAG.getConstant(CCode, MVT::i8);
            Cond = Cmp;
            addTest = false;
          }
        }
      }
    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
      // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
      // It should be transformed during dag combiner except when the condition
      // is set by a arithmetics with overflow node.
      X86::CondCode CCode =
        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
      CCode = X86::GetOppositeBranchCondition(CCode);
      CC = DAG.getConstant(CCode, MVT::i8);
      Cond = Cond.getOperand(0).getOperand(1);
      addTest = false;
    } else if (Cond.getOpcode() == ISD::SETCC &&
               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
      // For FCMP_OEQ, we can emit
      // two branches instead of an explicit AND instruction with a
      // separate test. However, we only do this if this block doesn't
      // have a fall-through edge, because this requires an explicit
      // jmp when the condition is false.
      if (Op.getNode()->hasOneUse()) {
        SDNode *User = *Op.getNode()->use_begin();
        // Look for an unconditional branch following this conditional branch.
        // We need this because we need to reverse the successors in order
        // to implement FCMP_OEQ.
        if (User->getOpcode() == ISD::BR) {
          // Swap successors as above, then branch to FalseBB on NE, and
          // (via the common epilogue) on P — OEQ fails if unordered.
          SDValue FalseBB = User->getOperand(1);
          SDNode *NewBR =
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
          assert(NewBR == User);
          (void)NewBR;
          Dest = FalseBB;

          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                                    Cond.getOperand(0), Cond.getOperand(1));
          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = DAG.getConstant(X86::COND_P, MVT::i8);
          Cond = Cmp;
          addTest = false;
        }
      }
    } else if (Cond.getOpcode() == ISD::SETCC &&
               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
      // For FCMP_UNE, we can emit
      // two branches instead of an explicit AND instruction with a
      // separate test. However, we only do this if this block doesn't
      // have a fall-through edge, because this requires an explicit
      // jmp when the condition is false.
      if (Op.getNode()->hasOneUse()) {
        SDNode *User = *Op.getNode()->use_begin();
        // Look for an unconditional branch following this conditional branch.
        // We need this because we need to reverse the successors in order
        // to implement FCMP_UNE.
        if (User->getOpcode() == ISD::BR) {
          SDValue FalseBB = User->getOperand(1);
          SDNode *NewBR =
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
          assert(NewBR == User);
          (void)NewBR;

          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                                    Cond.getOperand(0), Cond.getOperand(1));
          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
          // Note: unlike the FCMP_OEQ case, the first branch here targets
          // the ORIGINAL Dest; Dest is retargeted to FalseBB only afterwards
          // for the trailing NP branch.
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = DAG.getConstant(X86::COND_NP, MVT::i8);
          Cond = Cmp;
          addTest = false;
          Dest = FalseBB;
        }
      }
    }
  }

  if (addTest) {
    // Look pass the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
      if (NewSetCC.getNode()) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    // Fall back to an explicit TEST against zero with a NE condition.
    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DAG);
  }
  Cond = ConvertCmpIfNecessary(Cond, DAG);
  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                     Chain, Dest, CC, Cond);
}

// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca is needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// correct sequence.
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                           SelectionDAG &DAG) const {
  assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
          getTargetMachine().Options.EnableSegmentedStacks) &&
         "This should be used only on Windows targets or when segmented stacks "
         "are being used");
  assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
  SDLoc dl(Op);

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);
  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  EVT VT = Op.getNode()->getValueType(0);

  bool Is64Bit = Subtarget->is64Bit();
  EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;

  if (getTargetMachine().Options.EnableSegmentedStacks) {
    MachineFunction &MF = DAG.getMachineFunction();
    MachineRegisterInfo &MRI = MF.getRegInfo();

    if (Is64Bit) {
      // The 64 bit implementation of segmented stacks needs to clobber both r10
      // r11. This makes it impossible to use it along with nested parameters.
      const Function *F = MF.getFunction();

      for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
           I != E; ++I)
        if (I->hasNestAttr())
          report_fatal_error("Cannot use segmented stacks with functions that "
                             "have nested arguments.");
    }

    // Copy the allocation size into a fresh vreg and let SEG_ALLOCA do the
    // actual work; its result is the allocated address.
    const TargetRegisterClass *AddrRegClass =
      getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
    SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
                                DAG.getRegister(Vreg, SPTy));
    SDValue Ops1[2] = { Value, Chain };
    return DAG.getMergeValues(Ops1, 2, dl);
  } else {
    // Windows path: pass the size in (R|E)AX, glued into WIN_ALLOCA, which
    // performs the stack probing; then read the adjusted stack pointer back.
    SDValue Flag;
    unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);

    Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
    Flag = Chain.getValue(1);
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);

    const X86RegisterInfo *RegInfo =
      static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
    unsigned SPReg = RegInfo->getStackRegister();
    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
    Chain = SP.getValue(1);

    if (Align) {
      // Round SP down to the requested alignment (mask assumes Align is a
      // power of two — the usual contract for DYNAMIC_STACKALLOC).
      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
                       DAG.getConstant(-(uint64_t)Align, VT));
      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
    }

    SDValue Ops1[2] = { SP, Chain };
    return DAG.getMergeValues(Ops1, 2, dl);
  }
}

// LowerVASTART - Lower ISD::VASTART. On most targets this just records the
// vararg frame index; on x86-64 (non-Win64) it initializes the full
// four-field __va_list_tag structure.
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  SDLoc DL(Op);

  if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
                                   getPointerTy());
    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                        MachinePointerInfo(SV), false, false, 0);
  }

  // __va_list_tag:
  //   gp_offset         (0 - 6 * 8)
  //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (point to parameters coming in memory).
  //   reg_save_area
  SmallVector<SDValue, 8> MemOps;
  SDValue FIN = Op.getOperand(1);
  // Store gp_offset (field at byte offset 0).
  SDValue Store = DAG.getStore(Op.getOperand(0), DL,
                               DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
                                               MVT::i32),
                               FIN, MachinePointerInfo(SV), false, false, 0);
  MemOps.push_back(Store);

  // Store fp_offset (byte offset 4).
  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(4));
  Store = DAG.getStore(Op.getOperand(0), DL,
                       DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
                                       MVT::i32),
                       FIN, MachinePointerInfo(SV, 4), false, false, 0);
  MemOps.push_back(Store);

  // Store ptr to overflow_arg_area (byte offset 8).
  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(4));
  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
                                    getPointerTy());
  Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
                       MachinePointerInfo(SV, 8),
                       false, false, 0);
  MemOps.push_back(Store);

  // Store ptr to reg_save_area (byte offset 16).
  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(8));
  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                    getPointerTy());
  Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
                       MachinePointerInfo(SV, 16), false, false, 0);
  MemOps.push_back(Store);
  // Tie all four independent stores together into a single chain result.
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                     &MemOps[0], MemOps.size());
}

// LowerVAARG - Lower ISD::VAARG for the 64-bit System V ABI by emitting a
// VAARG_64 pseudo that computes the argument address, then loading from it.
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->is64Bit() &&
         "LowerVAARG only handles 64-bit va_arg!");
  assert((Subtarget->isTargetLinux() ||
          Subtarget->isTargetDarwin()) &&
         "Unhandled target in LowerVAARG");
  assert(Op.getNode()->getNumOperands() == 4);
  SDValue Chain = Op.getOperand(0);
  SDValue SrcPtr = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  unsigned Align = Op.getConstantOperandVal(3);
  SDLoc dl(Op);

  EVT ArgVT = Op.getNode()->getValueType(0);
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
  uint8_t ArgMode;

  // Decide which area this value should be read from.
  // TODO: Implement the AMD64 ABI in its entirety. This simple
  // selection mechanism works only for the basic types.
  if (ArgVT == MVT::f80) {
    llvm_unreachable("va_arg for f80 not yet implemented");
  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
  } else {
    llvm_unreachable("Unhandled argument type in LowerVAARG");
  }

  if (ArgMode == 2) {
    // Sanity Check: Make sure using fp_offset makes sense.
    // fp_offset is only meaningful when SSE registers are in play: not
    // soft-float, not noimplicitfloat, and SSE1 available.
    assert(!getTargetMachine().Options.UseSoftFloat &&
           !(DAG.getMachineFunction()
                .getFunction()->getAttributes()
                .hasAttribute(AttributeSet::FunctionIndex,
                              Attribute::NoImplicitFloat)) &&
           Subtarget->hasSSE1());
  }

  // Insert VAARG_64 node into the DAG
  // VAARG_64 returns two values: Variable Argument Address, Chain
  SmallVector<SDValue, 11> InstOps;
  InstOps.push_back(Chain);
  InstOps.push_back(SrcPtr);
  InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
  InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
  InstOps.push_back(DAG.getConstant(Align, MVT::i32));
  SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
  // Reads and writes the va_list (it bumps the offsets/pointers), hence
  // ReadMem and WriteMem are both set.
  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
                                          VTs, &InstOps[0], InstOps.size(),
                                          MVT::i64,
                                          MachinePointerInfo(SV),
                                          /*Align=*/0,
                                          /*Volatile=*/false,
                                          /*ReadMem=*/true,
                                          /*WriteMem=*/true);
  Chain = VAARG.getValue(1);

  // Load the next argument and return it
  return DAG.getLoad(ArgVT, dl,
                     Chain,
                     VAARG,
                     MachinePointerInfo(),
                     false, false, false, 0);
}

// LowerVACOPY - Lower ISD::VACOPY for x86-64: the 24-byte va_list struct is
// copied with a plain memcpy.
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
                           SelectionDAG &DAG) {
  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
  SDValue Chain = Op.getOperand(0);
  SDValue DstPtr = Op.getOperand(1);
  SDValue SrcPtr = Op.getOperand(2);
  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  SDLoc DL(Op);

  // 24 bytes = sizeof(__va_list_tag); 8-byte alignment.
  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
                       DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
                       false,
                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}

// getTargetVShiftByConstNode - Handle vector element shifts where the shift
// amount is a constant. Takes immediate version of shift as input.
static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT,
                                          SDValue SrcOp, uint64_t ShiftAmt,
                                          SelectionDAG &DAG) {

  // Check for ShiftAmt >= element width
  if (ShiftAmt >= VT.getVectorElementType().getSizeInBits()) {
    // Arithmetic right shift saturates at width-1 (still yields the sign
    // mask); logical shifts of >= width produce zero.
    if (Opc == X86ISD::VSRAI)
      ShiftAmt = VT.getVectorElementType().getSizeInBits() - 1;
    else
      return DAG.getConstant(0, VT);
  }

  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
         && "Unknown target vector shift-by-constant node");

  return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
}

// getTargetVShiftNode - Handle vector element shifts where the shift amount
// may or may not be a constant. Takes immediate version of shift as input.
static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT,
                                   SDValue SrcOp, SDValue ShAmt,
                                   SelectionDAG &DAG) {
  assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");

  // Catch shift-by-constant.
10993 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) 10994 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, 10995 CShAmt->getZExtValue(), DAG); 10996 10997 // Change opcode to non-immediate version 10998 switch (Opc) { 10999 default: llvm_unreachable("Unknown target vector shift node"); 11000 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; 11001 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; 11002 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; 11003 } 11004 11005 // Need to build a vector containing shift amount 11006 // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0 11007 SDValue ShOps[4]; 11008 ShOps[0] = ShAmt; 11009 ShOps[1] = DAG.getConstant(0, MVT::i32); 11010 ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32); 11011 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); 11012 11013 // The return type has to be a 128-bit type with the same element 11014 // type as the input type. 11015 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 11016 EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); 11017 11018 ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt); 11019 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); 11020} 11021 11022static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { 11023 SDLoc dl(Op); 11024 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 11025 switch (IntNo) { 11026 default: return SDValue(); // Don't custom lower most intrinsics. 11027 // Comparison intrinsics. 
  case Intrinsic::x86_sse_comieq_ss:
  case Intrinsic::x86_sse_comilt_ss:
  case Intrinsic::x86_sse_comile_ss:
  case Intrinsic::x86_sse_comigt_ss:
  case Intrinsic::x86_sse_comige_ss:
  case Intrinsic::x86_sse_comineq_ss:
  case Intrinsic::x86_sse_ucomieq_ss:
  case Intrinsic::x86_sse_ucomilt_ss:
  case Intrinsic::x86_sse_ucomile_ss:
  case Intrinsic::x86_sse_ucomigt_ss:
  case Intrinsic::x86_sse_ucomige_ss:
  case Intrinsic::x86_sse_ucomineq_ss:
  case Intrinsic::x86_sse2_comieq_sd:
  case Intrinsic::x86_sse2_comilt_sd:
  case Intrinsic::x86_sse2_comile_sd:
  case Intrinsic::x86_sse2_comigt_sd:
  case Intrinsic::x86_sse2_comile_sd ? 0 : 0;
  
11719 } 11720 } 11721} 11722 11723static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 11724 SDValue Base, SDValue Index, 11725 SDValue ScaleOp, SDValue Chain, 11726 const X86Subtarget * Subtarget) { 11727 SDLoc dl(Op); 11728 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 11729 assert(C && "Invalid scale type"); 11730 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 11731 SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); 11732 EVT MaskVT = MVT::getVectorVT(MVT::i1, 11733 Index.getValueType().getVectorNumElements()); 11734 SDValue MaskInReg = DAG.getConstant(~0, MaskVT); 11735 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); 11736 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 11737 SDValue Segment = DAG.getRegister(0, MVT::i32); 11738 SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; 11739 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 11740 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; 11741 return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); 11742} 11743 11744static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 11745 SDValue Src, SDValue Mask, SDValue Base, 11746 SDValue Index, SDValue ScaleOp, SDValue Chain, 11747 const X86Subtarget * Subtarget) { 11748 SDLoc dl(Op); 11749 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 11750 assert(C && "Invalid scale type"); 11751 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 11752 EVT MaskVT = MVT::getVectorVT(MVT::i1, 11753 Index.getValueType().getVectorNumElements()); 11754 SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); 11755 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); 11756 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 11757 SDValue Segment = DAG.getRegister(0, MVT::i32); 11758 if (Src.getOpcode() == ISD::UNDEF) 11759 Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); 11760 
SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; 11761 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 11762 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; 11763 return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); 11764} 11765 11766static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 11767 SDValue Src, SDValue Base, SDValue Index, 11768 SDValue ScaleOp, SDValue Chain) { 11769 SDLoc dl(Op); 11770 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 11771 assert(C && "Invalid scale type"); 11772 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 11773 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 11774 SDValue Segment = DAG.getRegister(0, MVT::i32); 11775 EVT MaskVT = MVT::getVectorVT(MVT::i1, 11776 Index.getValueType().getVectorNumElements()); 11777 SDValue MaskInReg = DAG.getConstant(~0, MaskVT); 11778 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); 11779 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; 11780 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 11781 return SDValue(Res, 1); 11782} 11783 11784static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 11785 SDValue Src, SDValue Mask, SDValue Base, 11786 SDValue Index, SDValue ScaleOp, SDValue Chain) { 11787 SDLoc dl(Op); 11788 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 11789 assert(C && "Invalid scale type"); 11790 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 11791 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 11792 SDValue Segment = DAG.getRegister(0, MVT::i32); 11793 EVT MaskVT = MVT::getVectorVT(MVT::i1, 11794 Index.getValueType().getVectorNumElements()); 11795 SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); 11796 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); 11797 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; 11798 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 
  return SDValue(Res, 1);
}

// Custom lowering for chained X86 intrinsics: RDRAND/RDSEED, the AVX-512
// gather/scatter families, and XTEST.  Any intrinsic not listed here is
// returned as an empty SDValue so that the generic lowering applies.
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
                                      SelectionDAG &DAG) {
  SDLoc dl(Op);
  // Operand 0 is the chain; operand 1 carries the intrinsic id.
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  switch (IntNo) {
  default: return SDValue();    // Don't custom lower most intrinsics.

  // RDRAND/RDSEED intrinsics.
  case Intrinsic::x86_rdrand_16:
  case Intrinsic::x86_rdrand_32:
  case Intrinsic::x86_rdrand_64:
  case Intrinsic::x86_rdseed_16:
  case Intrinsic::x86_rdseed_32:
  case Intrinsic::x86_rdseed_64: {
    unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 ||
                       IntNo == Intrinsic::x86_rdseed_32 ||
                       IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED :
                                                            X86ISD::RDRAND;
    // Emit the node with the right value type.
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
    SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0));

    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from Rand, which is always 0, casted to i32.
    // The CMOV consumes the glue result (Result value #1) carrying the flags.
    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
                      DAG.getConstant(1, Op->getValueType(1)),
                      DAG.getConstant(X86::COND_B, MVT::i32),
                      SDValue(Result.getNode(), 1) };
    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
                                  Ops, array_lengthof(Ops));

    // Return { result, isValid, chain }.
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                       SDValue(Result.getNode(), 2));
  }
  //int_gather(index, base, scale);
  case Intrinsic::x86_avx512_gather_qpd_512:
  case Intrinsic::x86_avx512_gather_qps_512:
  case Intrinsic::x86_avx512_gather_dpd_512:
  case Intrinsic::x86_avx512_gather_qpi_512:
  case Intrinsic::x86_avx512_gather_qpq_512:
  case Intrinsic::x86_avx512_gather_dpq_512:
  case Intrinsic::x86_avx512_gather_dps_512:
  case Intrinsic::x86_avx512_gather_dpi_512: {
    // Map the intrinsic to the concrete 512-bit gather machine opcode.
    unsigned Opc;
    switch (IntNo) {
      default: llvm_unreachable("Unexpected intrinsic!");
      case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break;
      case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break;
      case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break;
      case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break;
      case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break;
      case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break;
      case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break;
      case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break;
    }
    SDValue Chain = Op.getOperand(0);
    SDValue Index = Op.getOperand(2);
    SDValue Base  = Op.getOperand(3);
    SDValue Scale = Op.getOperand(4);
    return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget);
  }
  //int_gather_mask(v1, mask, index, base, scale);
  case Intrinsic::x86_avx512_gather_qps_mask_512:
  case Intrinsic::x86_avx512_gather_qpd_mask_512:
  case Intrinsic::x86_avx512_gather_dpd_mask_512:
  case Intrinsic::x86_avx512_gather_dps_mask_512:
  case Intrinsic::x86_avx512_gather_qpi_mask_512:
  case Intrinsic::x86_avx512_gather_qpq_mask_512:
  case Intrinsic::x86_avx512_gather_dpi_mask_512:
  case Intrinsic::x86_avx512_gather_dpq_mask_512: {
    // Masked gathers use the same machine opcodes; the mask is threaded
    // through getMGatherNode below.
    unsigned Opc;
    switch (IntNo) {
      default: llvm_unreachable("Unexpected intrinsic!");
      case Intrinsic::x86_avx512_gather_qps_mask_512:
        Opc = X86::VGATHERQPSZrm; break;
      case Intrinsic::x86_avx512_gather_qpd_mask_512:
        Opc = X86::VGATHERQPDZrm; break;
      case Intrinsic::x86_avx512_gather_dpd_mask_512:
        Opc = X86::VGATHERDPDZrm; break;
      case Intrinsic::x86_avx512_gather_dps_mask_512:
        Opc = X86::VGATHERDPSZrm; break;
      case Intrinsic::x86_avx512_gather_qpi_mask_512:
        Opc = X86::VPGATHERQDZrm; break;
      case Intrinsic::x86_avx512_gather_qpq_mask_512:
        Opc = X86::VPGATHERQQZrm; break;
      case Intrinsic::x86_avx512_gather_dpi_mask_512:
        Opc = X86::VPGATHERDDZrm; break;
      case Intrinsic::x86_avx512_gather_dpq_mask_512:
        Opc = X86::VPGATHERDQZrm; break;
    }
    SDValue Chain = Op.getOperand(0);
    SDValue Src   = Op.getOperand(2);
    SDValue Mask  = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Base  = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
                          Subtarget);
  }
  //int_scatter(base, index, v1, scale);
  case Intrinsic::x86_avx512_scatter_qpd_512:
  case Intrinsic::x86_avx512_scatter_qps_512:
  case Intrinsic::x86_avx512_scatter_dpd_512:
  case Intrinsic::x86_avx512_scatter_qpi_512:
  case Intrinsic::x86_avx512_scatter_qpq_512:
  case Intrinsic::x86_avx512_scatter_dpq_512:
  case Intrinsic::x86_avx512_scatter_dps_512:
  case Intrinsic::x86_avx512_scatter_dpi_512: {
    // Map the intrinsic to the concrete 512-bit scatter machine opcode.
    unsigned Opc;
    switch (IntNo) {
      default: llvm_unreachable("Unexpected intrinsic!");
      case Intrinsic::x86_avx512_scatter_qpd_512:
        Opc = X86::VSCATTERQPDZmr; break;
      case Intrinsic::x86_avx512_scatter_qps_512:
        Opc = X86::VSCATTERQPSZmr; break;
      case Intrinsic::x86_avx512_scatter_dpd_512:
        Opc = X86::VSCATTERDPDZmr; break;
      case Intrinsic::x86_avx512_scatter_dps_512:
        Opc = X86::VSCATTERDPSZmr; break;
      case Intrinsic::x86_avx512_scatter_qpi_512:
        Opc = X86::VPSCATTERQDZmr; break;
      case Intrinsic::x86_avx512_scatter_qpq_512:
        Opc = X86::VPSCATTERQQZmr; break;
      case Intrinsic::x86_avx512_scatter_dpq_512:
        Opc = X86::VPSCATTERDQZmr; break;
      case Intrinsic::x86_avx512_scatter_dpi_512:
        Opc = X86::VPSCATTERDDZmr; break;
    }
    SDValue Chain = Op.getOperand(0);
    SDValue Base  = Op.getOperand(2);
    SDValue Index = Op.getOperand(3);
    SDValue Src   = Op.getOperand(4);
    SDValue Scale = Op.getOperand(5);
    return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain);
  }
  //int_scatter_mask(base, mask, index, v1, scale);
  case Intrinsic::x86_avx512_scatter_qps_mask_512:
  case Intrinsic::x86_avx512_scatter_qpd_mask_512:
  case Intrinsic::x86_avx512_scatter_dpd_mask_512:
  case Intrinsic::x86_avx512_scatter_dps_mask_512:
  case Intrinsic::x86_avx512_scatter_qpi_mask_512:
  case Intrinsic::x86_avx512_scatter_qpq_mask_512:
  case Intrinsic::x86_avx512_scatter_dpi_mask_512:
  case Intrinsic::x86_avx512_scatter_dpq_mask_512: {
    // Masked scatters reuse the unmasked opcodes; the mask goes through
    // getMScatterNode below.
    unsigned Opc;
    switch (IntNo) {
      default: llvm_unreachable("Unexpected intrinsic!");
      case Intrinsic::x86_avx512_scatter_qpd_mask_512:
        Opc = X86::VSCATTERQPDZmr; break;
      case Intrinsic::x86_avx512_scatter_qps_mask_512:
        Opc = X86::VSCATTERQPSZmr; break;
      case Intrinsic::x86_avx512_scatter_dpd_mask_512:
        Opc = X86::VSCATTERDPDZmr; break;
      case Intrinsic::x86_avx512_scatter_dps_mask_512:
        Opc = X86::VSCATTERDPSZmr; break;
      case Intrinsic::x86_avx512_scatter_qpi_mask_512:
        Opc = X86::VPSCATTERQDZmr; break;
      case Intrinsic::x86_avx512_scatter_qpq_mask_512:
        Opc = X86::VPSCATTERQQZmr; break;
      case Intrinsic::x86_avx512_scatter_dpq_mask_512:
        Opc = X86::VPSCATTERDQZmr; break;
      case Intrinsic::x86_avx512_scatter_dpi_mask_512:
        Opc = X86::VPSCATTERDDZmr; break;
    }
    SDValue Chain = Op.getOperand(0);
    SDValue Base  = Op.getOperand(2);
    SDValue Mask  = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Src   = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
  }
  // XTEST intrinsics.
  case Intrinsic::x86_xtest: {
    // XTEST sets ZF; materialize the i8 "in transaction" flag via SETNE and
    // zero-extend it to the intrinsic's declared result type.
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0));
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                DAG.getConstant(X86::COND_NE, MVT::i8),
                                InTrans);
    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
                       Ret, SDValue(InTrans.getNode(), 1));
  }
  }
}

// Lower llvm.returnaddress: Depth 0 loads from the function's own return
// address slot; Depth > 0 walks one frame up (slot-size offset past the
// saved frame pointer chain produced by LowerFRAMEADDR).
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  MFI->setReturnAddressIsTaken(true);

  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);
  EVT PtrVT = getPointerTy();

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    const X86RegisterInfo *RegInfo =
      static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
    // The return address sits one slot above the saved frame pointer.
    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT,
                                   FrameAddr, Offset),
                       MachinePointerInfo(), false, false, false, 0);
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                     RetAddrFI, MachinePointerInfo(), false, false, false, 0);
}

// Lower llvm.frameaddress: copy out of the frame register (RBP/EBP) and then
// dereference Depth times to chase the saved-frame-pointer chain.
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  MFI->setFrameAddressIsTaken(true);

  EVT VT = Op.getValueType();
  SDLoc dl(Op);  // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  const X86RegisterInfo *RegInfo =
    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
          (FrameReg == X86::EBP && VT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  // Each load follows one saved frame pointer to the caller's frame.
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo(),
                            false, false, false, 0);
  return FrameAddr;
}

// Offset from the frame pointer to the first incoming argument:
// two stack slots — presumably the saved frame pointer and the return
// address (NOTE(review): confirm against the frame layout docs).
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) const {
  const X86RegisterInfo *RegInfo =
    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
}

// Lower ISD::EH_RETURN: store the landing-pad handler address into the
// return-address slot of the adjusted frame and hand the new stack pointer
// to the X86ISD::EH_RETURN node in RCX/ECX.
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain     = Op.getOperand(0);
  SDValue Offset    = Op.getOperand(1);
  SDValue Handler   = Op.getOperand(2);
  SDLoc dl (Op);

  EVT PtrVT = getPointerTy();
  const X86RegisterInfo *RegInfo =
    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

  // StoreAddr = FP + SlotSize + Offset: the return-address slot of the
  // frame we are unwinding to.
  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
                                  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
                       false, false, 0);
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
                     DAG.getRegister(StoreAddrReg, PtrVT));
}

// Lower llvm.eh.sjlj.setjmp to the target node (returns i32 + chain).
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
                     DAG.getVTList(MVT::i32, MVT::Other),
                     Op.getOperand(0), Op.getOperand(1));
}

// Lower llvm.eh.sjlj.longjmp to the target node (chain only).
SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
                     Op.getOperand(0), Op.getOperand(1));
}

// On x86 the trampoline pointer needs no adjustment: just forward it.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
  return Op.getOperand(0);
}

// Lower llvm.init.trampoline: write a small machine-code stub into the
// trampoline buffer that loads the 'nest' parameter into its dedicated
// register and jumps to the nested function.
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl (Op);

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();

  if (Subtarget->is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    // Stores are emitted as raw instruction bytes: two-byte opcode first,
    // then the 8-byte immediate (the function pointer) at offset 2.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr),
                                false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, MVT::i64));
    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
                                MachinePointerInfo(TrmpAddr, 2),
                                false, false, 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10),
                                false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, MVT::i64));
    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 12),
                                false, false, 2);

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20),
                                false, false, 0);

    // ModRM byte selecting register-indirect jump through r11.
    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
                                MachinePointerInfo(TrmpAddr, 22),
                                false, false, 0);

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
  } else {
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      FunctionType *FTy = Func->getFunctionType();
      const AttributeSet &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 1;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.hasAttribute(Idx, Attribute::InReg))
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;

        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::X86_ThisCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    // Disp is the pc-relative displacement for the jmp that ends the stub
    // (the jmp's own end is at Trmp + 10).
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
    OutChains[0] = DAG.getStore(Root, dl,
                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
                                Trmp, MachinePointerInfo(TrmpAddr),
                                false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, MVT::i32));
    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 1),
                                false, false, 1);

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
                                MachinePointerInfo(TrmpAddr, 5),
                                false, false, 1);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, MVT::i32));
    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
                                MachinePointerInfo(TrmpAddr, 6),
                                false, false, 1);

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
  }
}

SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
  */

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameLowering &TFI = *TM.getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  // Save FP Control Word to stack slot
  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());

  MachineMemOperand *MMO =
   MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                           MachineMemOperand::MOStore, 2, 2);

  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
                                          DAG.getVTList(MVT::Other),
                                          Ops, array_lengthof(Ops), MVT::i16,
                                          MMO);

  // Load FP Control Word from stack slot
  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
                            MachinePointerInfo(), false, false, false, 0);

  // Transform as necessary
  SDValue CWD1 =
    DAG.getNode(ISD::SRL, DL, MVT::i16,
                DAG.getNode(ISD::AND, DL, MVT::i16,
                            CWD, DAG.getConstant(0x800, MVT::i16)),
                DAG.getConstant(11, MVT::i8));
  SDValue CWD2 =
    DAG.getNode(ISD::SRL, DL, MVT::i16,
                DAG.getNode(ISD::AND, DL, MVT::i16,
                            CWD, DAG.getConstant(0x400, MVT::i16)),
                DAG.getConstant(9, MVT::i8));

  SDValue RetVal =
    DAG.getNode(ISD::AND, DL, MVT::i16,
                DAG.getNode(ISD::ADD, DL, MVT::i16,
                            DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
                            DAG.getConstant(1, MVT::i16)),
                DAG.getConstant(3, MVT::i16));

  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}

// Lower CTLZ using BSR: ctlz(x) == (NumBits-1) ^ bsr(x), with a CMOV to
// produce NumBits when the source is zero.
static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  EVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  SDLoc dl(Op);

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);

  // If src is zero (i.e. bsr sets ZF), returns NumBits.
  // (2*NumBits-1) ^ (NumBits-1) == NumBits, so selecting this constant here
  // yields NumBits after the final xor below.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits+NumBits-1, OpVT),
    DAG.getConstant(X86::COND_E, MVT::i8),
    Op.getValue(1)
  };
  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));

  // Finally xor with NumBits-1.
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}

// Lower CTLZ_ZERO_UNDEF: same as LowerCTLZ but without the zero-input CMOV,
// since the result is undefined for a zero source.
static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  EVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  SDLoc dl(Op);

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsr (scan bits in reverse).
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);

  // And xor with NumBits-1.
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}

// Lower CTTZ using BSF, with a CMOV producing NumBits for a zero source.
static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  unsigned NumBits = VT.getSizeInBits();
  SDLoc dl(Op);
  Op = Op.getOperand(0);

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits, VT),
    DAG.getConstant(X86::COND_E, MVT::i8),
    Op.getValue(1)
  };
  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops));
}

// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
// ones, and then concatenate the result back.
12390static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { 12391 EVT VT = Op.getValueType(); 12392 12393 assert(VT.is256BitVector() && VT.isInteger() && 12394 "Unsupported value type for operation"); 12395 12396 unsigned NumElems = VT.getVectorNumElements(); 12397 SDLoc dl(Op); 12398 12399 // Extract the LHS vectors 12400 SDValue LHS = Op.getOperand(0); 12401 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 12402 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 12403 12404 // Extract the RHS vectors 12405 SDValue RHS = Op.getOperand(1); 12406 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); 12407 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); 12408 12409 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 12410 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 12411 12412 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 12413 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), 12414 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); 12415} 12416 12417static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { 12418 assert(Op.getValueType().is256BitVector() && 12419 Op.getValueType().isInteger() && 12420 "Only handle AVX 256-bit vector integer operation"); 12421 return Lower256IntArith(Op, DAG); 12422} 12423 12424static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { 12425 assert(Op.getValueType().is256BitVector() && 12426 Op.getValueType().isInteger() && 12427 "Only handle AVX 256-bit vector integer operation"); 12428 return Lower256IntArith(Op, DAG); 12429} 12430 12431static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, 12432 SelectionDAG &DAG) { 12433 SDLoc dl(Op); 12434 EVT VT = Op.getValueType(); 12435 12436 // Decompose 256-bit ops into smaller 128-bit ops. 12437 if (VT.is256BitVector() && !Subtarget->hasInt256()) 12438 return Lower256IntArith(Op, DAG); 12439 12440 SDValue A = Op.getOperand(0); 12441 SDValue B = Op.getOperand(1); 12442 12443 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. 
12444 if (VT == MVT::v4i32) { 12445 assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() && 12446 "Should not custom lower when pmuldq is available!"); 12447 12448 // Extract the odd parts. 12449 static const int UnpackMask[] = { 1, -1, 3, -1 }; 12450 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); 12451 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); 12452 12453 // Multiply the even parts. 12454 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B); 12455 // Now multiply odd parts. 12456 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); 12457 12458 Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens); 12459 Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds); 12460 12461 // Merge the two vectors back together with a shuffle. This expands into 2 12462 // shuffles. 12463 static const int ShufMask[] = { 0, 4, 2, 6 }; 12464 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); 12465 } 12466 12467 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && 12468 "Only know how to lower V2I64/V4I64/V8I64 multiply"); 12469 12470 // Ahi = psrlqi(a, 32); 12471 // Bhi = psrlqi(b, 32); 12472 // 12473 // AloBlo = pmuludq(a, b); 12474 // AloBhi = pmuludq(a, Bhi); 12475 // AhiBlo = pmuludq(Ahi, b); 12476 12477 // AloBhi = psllqi(AloBhi, 32); 12478 // AhiBlo = psllqi(AhiBlo, 32); 12479 // return AloBlo + AloBhi + AhiBlo; 12480 12481 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); 12482 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); 12483 12484 // Bit cast to 32-bit vectors for MULUDQ 12485 EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : 12486 (VT == MVT::v4i64) ? 
MVT::v8i32 : MVT::v16i32; 12487 A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); 12488 B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); 12489 Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); 12490 Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi); 12491 12492 SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); 12493 SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); 12494 SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); 12495 12496 AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); 12497 AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); 12498 12499 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 12500 return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 12501} 12502 12503static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { 12504 EVT VT = Op.getValueType(); 12505 EVT EltTy = VT.getVectorElementType(); 12506 unsigned NumElts = VT.getVectorNumElements(); 12507 SDValue N0 = Op.getOperand(0); 12508 SDLoc dl(Op); 12509 12510 // Lower sdiv X, pow2-const. 12511 BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1)); 12512 if (!C) 12513 return SDValue(); 12514 12515 APInt SplatValue, SplatUndef; 12516 unsigned SplatBitSize; 12517 bool HasAnyUndefs; 12518 if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, 12519 HasAnyUndefs) || 12520 EltTy.getSizeInBits() < SplatBitSize) 12521 return SDValue(); 12522 12523 if ((SplatValue != 0) && 12524 (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) { 12525 unsigned Lg2 = SplatValue.countTrailingZeros(); 12526 // Splat the sign bit. 12527 SmallVector<SDValue, 16> Sz(NumElts, 12528 DAG.getConstant(EltTy.getSizeInBits() - 1, 12529 EltTy)); 12530 SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0, 12531 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0], 12532 NumElts)); 12533 // Add (N0 < 0) ? 
abs2 - 1 : 0; 12534 SmallVector<SDValue, 16> Amt(NumElts, 12535 DAG.getConstant(EltTy.getSizeInBits() - Lg2, 12536 EltTy)); 12537 SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN, 12538 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0], 12539 NumElts)); 12540 SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL); 12541 SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy)); 12542 SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD, 12543 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0], 12544 NumElts)); 12545 12546 // If we're dividing by a positive value, we're done. Otherwise, we must 12547 // negate the result. 12548 if (SplatValue.isNonNegative()) 12549 return SRA; 12550 12551 SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy)); 12552 SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts); 12553 return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA); 12554 } 12555 return SDValue(); 12556} 12557 12558static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, 12559 const X86Subtarget *Subtarget) { 12560 EVT VT = Op.getValueType(); 12561 SDLoc dl(Op); 12562 SDValue R = Op.getOperand(0); 12563 SDValue Amt = Op.getOperand(1); 12564 12565 // Optimize shl/srl/sra with constant shift amount. 
12566 if (isSplatVector(Amt.getNode())) { 12567 SDValue SclrAmt = Amt->getOperand(0); 12568 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { 12569 uint64_t ShiftAmt = C->getZExtValue(); 12570 12571 if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || 12572 (Subtarget->hasInt256() && 12573 (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) || 12574 (Subtarget->hasAVX512() && 12575 (VT == MVT::v8i64 || VT == MVT::v16i32))) { 12576 if (Op.getOpcode() == ISD::SHL) 12577 return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt, 12578 DAG); 12579 if (Op.getOpcode() == ISD::SRL) 12580 return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, 12581 DAG); 12582 if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) 12583 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt, 12584 DAG); 12585 } 12586 12587 if (VT == MVT::v16i8) { 12588 if (Op.getOpcode() == ISD::SHL) { 12589 // Make a large shift. 12590 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, 12591 MVT::v8i16, R, ShiftAmt, 12592 DAG); 12593 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); 12594 // Zero out the rightmost bits. 12595 SmallVector<SDValue, 16> V(16, 12596 DAG.getConstant(uint8_t(-1U << ShiftAmt), 12597 MVT::i8)); 12598 return DAG.getNode(ISD::AND, dl, VT, SHL, 12599 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); 12600 } 12601 if (Op.getOpcode() == ISD::SRL) { 12602 // Make a large shift. 12603 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, 12604 MVT::v8i16, R, ShiftAmt, 12605 DAG); 12606 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); 12607 // Zero out the leftmost bits. 
12608 SmallVector<SDValue, 16> V(16, 12609 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 12610 MVT::i8)); 12611 return DAG.getNode(ISD::AND, dl, VT, SRL, 12612 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); 12613 } 12614 if (Op.getOpcode() == ISD::SRA) { 12615 if (ShiftAmt == 7) { 12616 // R s>> 7 === R s< 0 12617 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 12618 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 12619 } 12620 12621 // R s>> a === ((R u>> a) ^ m) - m 12622 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 12623 SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, 12624 MVT::i8)); 12625 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); 12626 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 12627 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 12628 return Res; 12629 } 12630 llvm_unreachable("Unknown shift opcode."); 12631 } 12632 12633 if (Subtarget->hasInt256() && VT == MVT::v32i8) { 12634 if (Op.getOpcode() == ISD::SHL) { 12635 // Make a large shift. 12636 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, 12637 MVT::v16i16, R, ShiftAmt, 12638 DAG); 12639 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); 12640 // Zero out the rightmost bits. 12641 SmallVector<SDValue, 32> V(32, 12642 DAG.getConstant(uint8_t(-1U << ShiftAmt), 12643 MVT::i8)); 12644 return DAG.getNode(ISD::AND, dl, VT, SHL, 12645 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); 12646 } 12647 if (Op.getOpcode() == ISD::SRL) { 12648 // Make a large shift. 12649 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, 12650 MVT::v16i16, R, ShiftAmt, 12651 DAG); 12652 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); 12653 // Zero out the leftmost bits. 
12654 SmallVector<SDValue, 32> V(32, 12655 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 12656 MVT::i8)); 12657 return DAG.getNode(ISD::AND, dl, VT, SRL, 12658 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); 12659 } 12660 if (Op.getOpcode() == ISD::SRA) { 12661 if (ShiftAmt == 7) { 12662 // R s>> 7 === R s< 0 12663 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 12664 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 12665 } 12666 12667 // R s>> a === ((R u>> a) ^ m) - m 12668 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 12669 SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, 12670 MVT::i8)); 12671 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32); 12672 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 12673 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 12674 return Res; 12675 } 12676 llvm_unreachable("Unknown shift opcode."); 12677 } 12678 } 12679 } 12680 12681 // Special case in 32-bit mode, where i64 is expanded into high and low parts. 12682 if (!Subtarget->is64Bit() && 12683 (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && 12684 Amt.getOpcode() == ISD::BITCAST && 12685 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { 12686 Amt = Amt.getOperand(0); 12687 unsigned Ratio = Amt.getValueType().getVectorNumElements() / 12688 VT.getVectorNumElements(); 12689 unsigned RatioInLog2 = Log2_32_Ceil(Ratio); 12690 uint64_t ShiftAmt = 0; 12691 for (unsigned i = 0; i != Ratio; ++i) { 12692 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i)); 12693 if (C == 0) 12694 return SDValue(); 12695 // 6 == Log2(64) 12696 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); 12697 } 12698 // Check remaining shift amounts. 
    // (Tail of LowerScalarImmediateShift.) Every subsequent group of Ratio
    // build_vector operands must encode the same 64-bit shift amount as the
    // first group, otherwise this is not a uniform shift.
    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
      uint64_t ShAmt = 0;
      for (unsigned j = 0; j != Ratio; ++j) {
        ConstantSDNode *C =
          dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
        if (C == 0)
          return SDValue();
        // 6 == Log2(64)
        ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
      }
      if (ShAmt != ShiftAmt)
        return SDValue();
    }
    switch (Op.getOpcode()) {
    default:
      llvm_unreachable("Unknown shift opcode!");
    case ISD::SHL:
      return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
                                        DAG);
    case ISD::SRL:
      return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
                                        DAG);
    case ISD::SRA:
      return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
                                        DAG);
    }
  }

  return SDValue();
}

// LowerScalarVariableShift - Lower a vector shift whose amount operand is (or
// splats to) a single scalar, using the target shift-by-scalar nodes
// (X86ISD::VSHLI/VSRLI/VSRAI via getTargetVShiftNode).  Returns an empty
// SDValue when no uniform scalar amount can be extracted.
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget* Subtarget) {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);

  // Types with a legal shift-by-scalar form.  v2i64/v4i64 SRA is excluded
  // because there is no 64-bit arithmetic vector shift before AVX-512.
  if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
      VT == MVT::v4i32 || VT == MVT::v8i16 ||
      (Subtarget->hasInt256() &&
       ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
        VT == MVT::v8i32 || VT == MVT::v16i16)) ||
      (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
    SDValue BaseShAmt;
    EVT EltVT = VT.getVectorElementType();

    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
      // Find the first non-undef operand (i), then require every later
      // defined operand to match it; only then is the amount a splat.
      unsigned NumElts = VT.getVectorNumElements();
      unsigned i, j;
      for (i = 0; i != NumElts; ++i) {
        if (Amt.getOperand(i).getOpcode() == ISD::UNDEF)
          continue;
        break;
      }
      for (j = i; j != NumElts; ++j) {
        SDValue Arg = Amt.getOperand(j);
        if (Arg.getOpcode() == ISD::UNDEF) continue;
        if (Arg != Amt.getOperand(i))
          break;
      }
      if (i != NumElts && j == NumElts)
        BaseShAmt = Amt.getOperand(i);
    } else {
      // Look through a subvector extract, then try to peel a splat shuffle.
      if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
        Amt = Amt.getOperand(0);
      if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE &&
           cast<ShuffleVectorSDNode>(Amt)->isSplat()) {
        SDValue InVec = Amt.getOperand(0);
        if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
          // First defined element of the splatted build_vector.
          unsigned NumElts = InVec.getValueType().getVectorNumElements();
          unsigned i = 0;
          for (; i != NumElts; ++i) {
            SDValue Arg = InVec.getOperand(i);
            if (Arg.getOpcode() == ISD::UNDEF) continue;
            BaseShAmt = Arg;
            break;
          }
        } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
           // If the splat index is exactly the inserted lane, the inserted
           // scalar is the shift amount.
           if (ConstantSDNode *C =
                   dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
             unsigned SplatIdx =
                 cast<ShuffleVectorSDNode>(Amt)->getSplatIndex();
             if (C->getZExtValue() == SplatIdx)
               BaseShAmt = InVec.getOperand(1);
           }
        }
        // Fall back to extracting lane 0 of the splat.
        if (BaseShAmt.getNode() == 0)
          BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt,
                                  DAG.getIntPtrConstant(0));
      }
    }

    if (BaseShAmt.getNode()) {
      // The target shift nodes take an i32 scalar amount.
      if (EltVT.bitsGT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt);
      else if (EltVT.bitsLT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

      switch (Op.getOpcode()) {
      default:
        llvm_unreachable("Unknown shift opcode!");
      case ISD::SHL:
        switch (VT.getSimpleVT().SimpleTy) {
        default: return SDValue();
        case MVT::v2i64:
        case MVT::v4i32:
        case MVT::v8i16:
        case MVT::v4i64:
        case MVT::v8i32:
        case MVT::v16i16:
        case MVT::v16i32:
        case MVT::v8i64:
          return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
        }
      case ISD::SRA:
        // Note: no v2i64/v4i64 cases here (see type check above).
        switch (VT.getSimpleVT().SimpleTy) {
        default: return SDValue();
        case MVT::v4i32:
        case MVT::v8i16:
        case MVT::v8i32:
        case MVT::v16i16:
        case MVT::v16i32:
        case MVT::v8i64:
          return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
        }
      case ISD::SRL:
        switch (VT.getSimpleVT().SimpleTy) {
        default: return SDValue();
        case MVT::v2i64:
        case MVT::v4i32:
        case MVT::v8i16:
        case MVT::v4i64:
        case MVT::v8i32:
        case MVT::v16i16:
        case MVT::v16i32:
        case MVT::v8i64:
          return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
        }
      }
    }
  }

  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
  // The amount arrives as (bitcast (build_vector lo0,hi0,lo1,hi1,...)); it is
  // usable if every i64 lane's parts match lane 0's parts.
  if (!Subtarget->is64Bit() &&
      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
       (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
      Amt.getOpcode() == ISD::BITCAST &&
      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    Amt = Amt.getOperand(0);
    unsigned Ratio = Amt.getValueType().getVectorNumElements() /
                     VT.getVectorNumElements();
    std::vector<SDValue> Vals(Ratio);
    for (unsigned i = 0; i != Ratio; ++i)
      Vals[i] = Amt.getOperand(i);
    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
      for (unsigned j = 0; j != Ratio; ++j)
        if (Vals[j] != Amt.getOperand(i + j))
          return SDValue();
    }
    switch (Op.getOpcode()) {
    default:
      llvm_unreachable("Unknown shift opcode!");
    case ISD::SHL:
      return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
    case ISD::SRL:
      return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
    case ISD::SRA:
      return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
    }
  }

  return SDValue();
}

// LowerShift - Custom-lower vector SHL/SRL/SRA.  Tries constant-immediate and
// scalar-splat strategies first, then falls back to type-specific expansions.
static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
                          SelectionDAG &DAG) {

  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt =
      Op.getOperand(1);
  SDValue V;

  if (!Subtarget->hasSSE2())
    return SDValue();

  // Prefer the immediate-shift lowering, then the uniform-scalar lowering.
  V = LowerScalarImmediateShift(Op, DAG, Subtarget);
  if (V.getNode())
    return V;

  V = LowerScalarVariableShift(Op, DAG, Subtarget);
  if (V.getNode())
    return V;

  // AVX-512 handles these variable shifts natively.
  if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
    return Op;
  // AVX2 has VPSLLV/VPSRAV/VPSRLV.
  if (Subtarget->hasInt256()) {
    if (Op.getOpcode() == ISD::SRL &&
        (VT == MVT::v2i64 || VT == MVT::v4i32 ||
         VT == MVT::v4i64 || VT == MVT::v8i32))
      return Op;
    if (Op.getOpcode() == ISD::SHL &&
        (VT == MVT::v2i64 || VT == MVT::v4i32 ||
         VT == MVT::v4i64 || VT == MVT::v8i32))
      return Op;
    // No VPSRAV for 64-bit elements on AVX2.
    if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
      return Op;
  }

  // Lower SHL with variable shift amount.
  // Builds 2^Amt as a float via exponent manipulation (0x3f800000 is 1.0f),
  // converts back to integer, and multiplies: R << Amt == R * 2^Amt.
  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));

    Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
  }
  // v16i8 SHL: there is no byte shift instruction, so do it bit-by-bit with
  // word shifts plus VSELECT, testing one amount bit per round.
  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
    assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");

    // a = a << 5;
    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
    Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);

    // Turn 'a' into a mask suitable for VSELECT
    SDValue VSelM = DAG.getConstant(0x80, VT);
    SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);

    SDValue CM1 = DAG.getConstant(0x0f, VT);
    SDValue CM2 = DAG.getConstant(0x3f, VT);

    // r = VSELECT(r, psllw(r & (char16)15, 4), a);
    SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
    M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);

    // a += a
    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);

    // r = VSELECT(r, psllw(r & (char16)63, 2), a);
    M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
    M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);

    // a += a
    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);

    // return VSELECT(r, r+r, a);
    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
                    DAG.getNode(ISD::ADD, dl, VT, R, R), R);
    return R;
  }

  // Decompose 256-bit shifts into smaller 128-bit shifts.
  if (VT.is256BitVector()) {
    unsigned NumElems = VT.getVectorNumElements();
    MVT EltVT = VT.getVectorElementType().getSimpleVT();
    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);

    // Extract the two vectors
    SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
    SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);

    // Recreate the shift amount vectors
    SDValue Amt1, Amt2;
    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
      // Constant shift amount
      SmallVector<SDValue, 4> Amt1Csts;
      SmallVector<SDValue, 4> Amt2Csts;
      for (unsigned i = 0; i != NumElems/2; ++i)
        Amt1Csts.push_back(Amt->getOperand(i));
      for (unsigned i = NumElems/2; i != NumElems; ++i)
        Amt2Csts.push_back(Amt->getOperand(i));

      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
                         &Amt1Csts[0], NumElems/2);
      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
                         &Amt2Csts[0], NumElems/2);
    } else {
      // Variable shift amount
      Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
      Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
    }

    // Issue new vector shifts for the smaller types
    V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
    V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);

    // Concatenate the result back
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
  }

  return SDValue();
}

static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
  // looks for this combo and may remove the "setcc" instruction if the "setcc"
  // has only one use.
13009 SDNode *N = Op.getNode(); 13010 SDValue LHS = N->getOperand(0); 13011 SDValue RHS = N->getOperand(1); 13012 unsigned BaseOp = 0; 13013 unsigned Cond = 0; 13014 SDLoc DL(Op); 13015 switch (Op.getOpcode()) { 13016 default: llvm_unreachable("Unknown ovf instruction!"); 13017 case ISD::SADDO: 13018 // A subtract of one will be selected as a INC. Note that INC doesn't 13019 // set CF, so we can't do this for UADDO. 13020 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 13021 if (C->isOne()) { 13022 BaseOp = X86ISD::INC; 13023 Cond = X86::COND_O; 13024 break; 13025 } 13026 BaseOp = X86ISD::ADD; 13027 Cond = X86::COND_O; 13028 break; 13029 case ISD::UADDO: 13030 BaseOp = X86ISD::ADD; 13031 Cond = X86::COND_B; 13032 break; 13033 case ISD::SSUBO: 13034 // A subtract of one will be selected as a DEC. Note that DEC doesn't 13035 // set CF, so we can't do this for USUBO. 13036 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 13037 if (C->isOne()) { 13038 BaseOp = X86ISD::DEC; 13039 Cond = X86::COND_O; 13040 break; 13041 } 13042 BaseOp = X86ISD::SUB; 13043 Cond = X86::COND_O; 13044 break; 13045 case ISD::USUBO: 13046 BaseOp = X86ISD::SUB; 13047 Cond = X86::COND_B; 13048 break; 13049 case ISD::SMULO: 13050 BaseOp = X86ISD::SMUL; 13051 Cond = X86::COND_O; 13052 break; 13053 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 13054 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 13055 MVT::i32); 13056 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 13057 13058 SDValue SetCC = 13059 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 13060 DAG.getConstant(X86::COND_O, MVT::i32), 13061 SDValue(Sum.getNode(), 2)); 13062 13063 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 13064 } 13065 } 13066 13067 // Also sets EFLAGS. 
13068 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 13069 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 13070 13071 SDValue SetCC = 13072 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), 13073 DAG.getConstant(Cond, MVT::i32), 13074 SDValue(Sum.getNode(), 1)); 13075 13076 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 13077} 13078 13079SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 13080 SelectionDAG &DAG) const { 13081 SDLoc dl(Op); 13082 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 13083 EVT VT = Op.getValueType(); 13084 13085 if (!Subtarget->hasSSE2() || !VT.isVector()) 13086 return SDValue(); 13087 13088 unsigned BitsDiff = VT.getScalarType().getSizeInBits() - 13089 ExtraVT.getScalarType().getSizeInBits(); 13090 13091 switch (VT.getSimpleVT().SimpleTy) { 13092 default: return SDValue(); 13093 case MVT::v8i32: 13094 case MVT::v16i16: 13095 if (!Subtarget->hasFp256()) 13096 return SDValue(); 13097 if (!Subtarget->hasInt256()) { 13098 // needs to be split 13099 unsigned NumElems = VT.getVectorNumElements(); 13100 13101 // Extract the LHS vectors 13102 SDValue LHS = Op.getOperand(0); 13103 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 13104 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 13105 13106 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 13107 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 13108 13109 EVT ExtraEltVT = ExtraVT.getVectorElementType(); 13110 unsigned ExtraNumElems = ExtraVT.getVectorNumElements(); 13111 ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, 13112 ExtraNumElems/2); 13113 SDValue Extra = DAG.getValueType(ExtraVT); 13114 13115 LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); 13116 LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); 13117 13118 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2); 13119 } 13120 // fall through 13121 case MVT::v4i32: 13122 case MVT::v8i16: { 13123 SDValue Op0 = 
      Op.getOperand(0);
    SDValue Op00 = Op0.getOperand(0);
    SDValue Tmp1;
    // Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
    if (Op0.getOpcode() == ISD::BITCAST &&
        Op00.getOpcode() == ISD::VECTOR_SHUFFLE) {
      // (sext (vzext x)) -> (vsext x)
      Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG);
      if (Tmp1.getNode()) {
        EVT ExtraEltVT = ExtraVT.getVectorElementType();
        // This folding is only valid when the in-reg type is a vector of i8,
        // i16, or i32.
        if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 ||
            ExtraEltVT == MVT::i32) {
          SDValue Tmp1Op0 = Tmp1.getOperand(0);
          assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
                 "This optimization is invalid without a VZEXT.");
          return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
        }
        Op0 = Tmp1;
      }
    }

    // If the above didn't work, then just use Shift-Left + Shift-Right.
    Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff,
                                      DAG);
    return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff,
                                      DAG);
  }
  }
}

// Lower an ATOMIC_FENCE: only a sequentially-consistent cross-thread fence
// needs an actual instruction; everything else is a compiler-only barrier.
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
                                 SelectionDAG &DAG) {
  SDLoc dl(Op);
  // Operand 1 carries the ordering, operand 2 the synchronization scope.
  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
    cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // The only fence that needs an instruction is a sequentially-consistent
  // cross-thread fence.
  if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
    // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
    // no-sse2). There isn't any reason to disable it if the target processor
    // supports it.
    if (Subtarget->hasSSE2() || Subtarget->is64Bit())
      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

    // No MFENCE available: a locked or-to-stack serves as a full barrier.
    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, MVT::i32);
    SDValue Ops[] = {
      DAG.getRegister(X86::ESP, MVT::i32), // Base
      DAG.getTargetConstant(1, MVT::i8),   // Scale
      DAG.getRegister(0, MVT::i32),        // Index
      DAG.getTargetConstant(0, MVT::i32),  // Disp
      DAG.getRegister(0, MVT::i32),        // Segment.
      Zero,
      Chain
    };
    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
    return SDValue(Res, 0);
  }

  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}

// Lower ATOMIC_CMP_SWAP to LCMPXCHG_DAG: the expected value travels in the
// size-appropriate accumulator register (AL/AX/EAX/RAX) per CMPXCHG's ABI.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
                             SelectionDAG &DAG) {
  EVT T = Op.getValueType();
  SDLoc DL(Op);
  unsigned Reg = 0;
  unsigned size = 0;
  switch(T.getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Invalid value type!");
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
  case MVT::i16: Reg = X86::AX;  size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget->is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
  // Copy the comparison value (operand 2) into the accumulator.
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
                                  Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, MVT::i8),
                    cpIn.getValue(1) };
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                           Ops, array_lengthof(Ops), T, MMO);
  // The old memory value comes back in the same accumulator register.
  SDValue cpOut =
    DAG.getCopyFromReg(Result.getValue(0), DL, Reg,
T, Result.getValue(1)); 13220 return cpOut; 13221} 13222 13223static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, 13224 SelectionDAG &DAG) { 13225 assert(Subtarget->is64Bit() && "Result not type legalized?"); 13226 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 13227 SDValue TheChain = Op.getOperand(0); 13228 SDLoc dl(Op); 13229 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 13230 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 13231 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 13232 rax.getValue(2)); 13233 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 13234 DAG.getConstant(32, MVT::i8)); 13235 SDValue Ops[] = { 13236 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 13237 rdx.getValue(1) 13238 }; 13239 return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); 13240} 13241 13242static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, 13243 SelectionDAG &DAG) { 13244 MVT SrcVT = Op.getOperand(0).getSimpleValueType(); 13245 MVT DstVT = Op.getSimpleValueType(); 13246 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && 13247 Subtarget->hasMMX() && "Unexpected custom BITCAST"); 13248 assert((DstVT == MVT::i64 || 13249 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 13250 "Unexpected custom BITCAST"); 13251 // i64 <=> MMX conversions are Legal. 13252 if (SrcVT==MVT::i64 && DstVT.isVector()) 13253 return Op; 13254 if (DstVT==MVT::i64 && SrcVT.isVector()) 13255 return Op; 13256 // MMX <=> MMX conversions are Legal. 13257 if (SrcVT.isVector() && DstVT.isVector()) 13258 return Op; 13259 // All other conversions need to be expanded. 
13260 return SDValue(); 13261} 13262 13263static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { 13264 SDNode *Node = Op.getNode(); 13265 SDLoc dl(Node); 13266 EVT T = Node->getValueType(0); 13267 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 13268 DAG.getConstant(0, T), Node->getOperand(2)); 13269 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 13270 cast<AtomicSDNode>(Node)->getMemoryVT(), 13271 Node->getOperand(0), 13272 Node->getOperand(1), negOp, 13273 cast<AtomicSDNode>(Node)->getSrcValue(), 13274 cast<AtomicSDNode>(Node)->getAlignment(), 13275 cast<AtomicSDNode>(Node)->getOrdering(), 13276 cast<AtomicSDNode>(Node)->getSynchScope()); 13277} 13278 13279static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { 13280 SDNode *Node = Op.getNode(); 13281 SDLoc dl(Node); 13282 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 13283 13284 // Convert seq_cst store -> xchg 13285 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) 13286 // FIXME: On 32-bit, store -> fist or movq would be more efficient 13287 // (The only way to get a 16-byte store is cmpxchg16b) 13288 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. 13289 if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent || 13290 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 13291 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, 13292 cast<AtomicSDNode>(Node)->getMemoryVT(), 13293 Node->getOperand(0), 13294 Node->getOperand(1), Node->getOperand(2), 13295 cast<AtomicSDNode>(Node)->getMemOperand(), 13296 cast<AtomicSDNode>(Node)->getOrdering(), 13297 cast<AtomicSDNode>(Node)->getSynchScope()); 13298 return Swap.getValue(1); 13299 } 13300 // Other atomic stores have a simple pattern. 13301 return Op; 13302} 13303 13304static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 13305 EVT VT = Op.getNode()->getValueType(0); 13306 13307 // Let legalize expand this if it isn't a legal type yet. 
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // Every X86ISD arithmetic node below also produces EFLAGS (the MVT::i32).
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  unsigned Opc;
  bool ExtraOp = false;  // true for the carry-consuming forms (ADC/SBB).
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Invalid code");
  case ISD::ADDC: Opc = X86ISD::ADD; break;
  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
  case ISD::SUBC: Opc = X86ISD::SUB; break;
  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
  }

  if (!ExtraOp)
    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
                       Op.getOperand(1));
  // ADC/SBB take the incoming carry as a third operand.
  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
                     Op.getOperand(1), Op.getOperand(2));
}

// Lower FSINCOS on Darwin/x86-64 by calling the combined
// __sincos_stret/__sincosf_stret entry points and unpacking the result.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());

  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
  // which returns the values as { float, float } (in XMM0) or
  // { double, double } (which is returned in XMM0, XMM1).
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.isSExt = false;
  Entry.isZExt = false;
  Args.push_back(Entry);

  bool isF64 = ArgVT == MVT::f64;
  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
  // the small struct {f32, f32} is returned in (eax, edx). For f64,
  // the results are returned via SRet in memory.
  const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());

  // Model the f32 variant's return as a v4f32 so both lanes land in XMM0.
  Type *RetTy = isF64
    ? (Type*)StructType::get(ArgTy, ArgTy, NULL)
    : (Type*)VectorType::get(ArgTy, 4);
  TargetLowering::
    CallLoweringInfo CLI(DAG.getEntryNode(), RetTy,
                         false, false, false, false, 0,
                         CallingConv::C, /*isTailCall=*/false,
                         /*doesNotRet=*/false, /*isReturnValueUsed*/true,
                         Callee, Args, DAG, dl);
  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

  if (isF64)
    // Returned in xmm0 and xmm1.
    return CallResult.first;

  // Returned in bits 0:31 and 32:64 xmm0.
  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(0));
  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(1));
  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}

/// LowerOperation - Provide custom lowering hooks for some operations.
/// Dispatches on the opcode to the matching per-operation lowering routine.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Should not custom lower this!");
  case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op, Subtarget, DAG);
  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
  case ISD::FABS:               return LowerFABS(Op, DAG);
  case ISD::FNEG:               return LowerFNEG(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  case ISD::FRAME_TO_ARGS_OFFSET:
                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:              return LowerXALUO(Op, DAG);
  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
  case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
  case ISD::ADD:                return LowerADD(Op, DAG);
  case ISD::SUB:                return LowerSUB(Op, DAG);
  case ISD::SDIV:               return LowerSDIV(Op, DAG);
  case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
  }
}

// Expand a wide ATOMIC_LOAD into a compare-and-swap; pushes the loaded value
// and the output chain onto Results.
static void ReplaceATOMIC_LOAD(SDNode *Node,
                               SmallVectorImpl<SDValue> &Results,
                               SelectionDAG &DAG) {
  SDLoc dl(Node);
  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();

  // Convert wide load -> cmpxchg8b/cmpxchg16b
  // FIXME: On 32-bit, load -> fild or movq would be more efficient
  // (The only way to get a 16-byte load is cmpxchg16b)
  // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
  // cmpxchg(ptr, 0, 0): if *ptr == 0 it stores 0 (a no-op write); either way
  // the first result is the current value of *ptr, i.e. the atomic load.
  SDValue Zero = DAG.getConstant(0, VT);
  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
                               Node->getOperand(0),
                               Node->getOperand(1), Zero, Zero,
                               cast<AtomicSDNode>(Node)->getMemOperand(),
                               cast<AtomicSDNode>(Node)->getOrdering(),
                               cast<AtomicSDNode>(Node)->getSynchScope());
  Results.push_back(Swap.getValue(0));
  Results.push_back(Swap.getValue(1));
}

// Expand an i64 atomic RMW on a 32-bit target: split the operand into
// lo/hi i32 halves, emit the target-specific NewOp pseudo, and rebuild the
// i64 result with BUILD_PAIR.  Pushes the value and the chain onto Results.
static void
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
                        SelectionDAG &DAG, unsigned NewOp) {
  SDLoc dl(Node);
  assert (Node->getValueType(0) == MVT::i64 &&
          "Only know how to expand i64 atomics");

  SDValue Chain = Node->getOperand(0);
  SDValue In1 = Node->getOperand(1);
  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(0));
  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(1));
  SDValue Ops[] = { Chain, In1, In2L, In2H };
  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
  SDValue Result =
    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64,
                            cast<MemSDNode>(Node)->getMemOperand());
  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
  Results.push_back(Result.getValue(2));
}

/// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG) const {
  SDLoc dl(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Do not know how to custom type legalize this operation!");
  case ISD::SIGN_EXTEND_INREG:
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:
    // We don't want to expand or promote these.
    return;
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: {
    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;

    // Unsigned conversions only get custom expansion on the Win32 _ftol2
    // path; otherwise bail and let generic legalization handle the node.
    if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
      return;

    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode() != 0) {
      EVT VT = N->getValueType(0);
      // Return a load from the stack slot.  When there is no stack slot the
      // helper produced the result directly (FIST is the value itself).
      if (StackSlot.getNode() != 0)
        Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
                                      MachinePointerInfo(),
                                      false, false, false, 0));
      else
        Results.push_back(FIST);
    }
    return;
  }
  case ISD::UINT_TO_FP: {
    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
    // Only the v2i32 -> v2f32 case is handled here.
    if (N->getOperand(0).getValueType() != MVT::v2i32 ||
        N->getValueType(0) != MVT::v2f32)
      return;
    // Zero-extend the i32 lanes to i64, OR in the exponent bits of 2^52 so
    // each lane becomes the double 2^52 + x, then subtract the bias to get
    // exactly x as a double; finally round v2f64 down to v2f32.
    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
                                 N->getOperand(0));
    SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
                                     MVT::f64);
    SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
                             DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
    Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
    return;
  }
  case ISD::FP_ROUND: {
    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
      return;
    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
    Results.push_back(V);
    return;
  }
  case ISD::READCYCLECOUNTER: {
    // RDTSC leaves the counter in EDX:EAX; read both halves off the glue
    // chain and merge them into an i64.
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue TheChain = N->getOperand(0);
    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
                                     rd.getValue(1));
    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
                                     eax.getValue(2));
    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
    SDValue Ops[] = { eax, edx };
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops,
                                  array_lengthof(Ops)));
    Results.push_back(edx.getValue(1)); // Chain.
    return;
  }
  case ISD::ATOMIC_CMP_SWAP: {
    // Expand a double-width cmpxchg into LCMPXCHG8B/LCMPXCHG16B.  The
    // expected value is pinned in (R|E)DX:(R|E)AX and the new value in
    // (R|E)CX:(R|E)BX, matching the instruction's fixed register operands.
    EVT T = N->getValueType(0);
    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
    bool Regs64bit = T == MVT::i128;
    EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(0, HalfT));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(1, HalfT));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
                             Regs64bit ? X86::RAX : X86::EAX,
                             cpInL, SDValue());
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
                             Regs64bit ? X86::RDX : X86::EDX,
                             cpInH, cpInL.getValue(1));
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
                          DAG.getConstant(0, HalfT));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
                          DAG.getConstant(1, HalfT));
    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
                               Regs64bit ? X86::RBX : X86::EBX,
                               swapInL, cpInH.getValue(1));
    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
                               Regs64bit ? X86::RCX : X86::ECX,
                               swapInH, swapInL.getValue(1));
    SDValue Ops[] = { swapInH.getValue(0),
                      N->getOperand(1),
                      swapInH.getValue(1) };
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
    unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
                                  X86ISD::LCMPXCHG8_DAG;
    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
                                             Ops, array_lengthof(Ops), T, MMO);
    // The old value comes back in (R|E)DX:(R|E)AX; reassemble it.
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
                                        Regs64bit ? X86::RAX : X86::EAX,
                                        HalfT, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
                                        Regs64bit ? X86::RDX : X86::EDX,
                                        HalfT, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
    Results.push_back(cpOutH.getValue(1)); // Chain.
    return;
  }
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_SWAP: {
    // Map the generic i64 atomic RMW opcode onto the matching
    // X86ISD::ATOM*64_DAG node and expand via ReplaceATOMIC_BINARY_64.
    unsigned Opc;
    switch (N->getOpcode()) {
    default: llvm_unreachable("Unexpected opcode");
    case ISD::ATOMIC_LOAD_ADD:
      Opc = X86ISD::ATOMADD64_DAG;
      break;
    case ISD::ATOMIC_LOAD_AND:
      Opc = X86ISD::ATOMAND64_DAG;
      break;
    case ISD::ATOMIC_LOAD_NAND:
      Opc = X86ISD::ATOMNAND64_DAG;
      break;
    case ISD::ATOMIC_LOAD_OR:
      Opc = X86ISD::ATOMOR64_DAG;
      break;
    case ISD::ATOMIC_LOAD_SUB:
      Opc = X86ISD::ATOMSUB64_DAG;
      break;
    case ISD::ATOMIC_LOAD_XOR:
      Opc = X86ISD::ATOMXOR64_DAG;
      break;
    case ISD::ATOMIC_LOAD_MAX:
      Opc = X86ISD::ATOMMAX64_DAG;
      break;
    case ISD::ATOMIC_LOAD_MIN:
      Opc = X86ISD::ATOMMIN64_DAG;
      break;
    case ISD::ATOMIC_LOAD_UMAX:
      Opc = X86ISD::ATOMUMAX64_DAG;
      break;
    case ISD::ATOMIC_LOAD_UMIN:
      Opc = X86ISD::ATOMUMIN64_DAG;
      break;
    case ISD::ATOMIC_SWAP:
      Opc = X86ISD::ATOMSWAP64_DAG;
      break;
    }
    ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
    return;
  }
  case ISD::ATOMIC_LOAD:
    ReplaceATOMIC_LOAD(N, Results, DAG);
  }
}

const char *X86TargetLowering::getTargetNodeName(unsigned
Opcode) const {
  // Debug-printing support: map each target-specific node opcode to its
  // textual name.  Returning NULL defers to the generic (non-target) names --
  // presumably handled by the caller; confirm against SelectionDAG dumping.
  switch (Opcode) {
  default: return NULL;
  case X86ISD::BSF:                return "X86ISD::BSF";
  case X86ISD::BSR:                return "X86ISD::BSR";
  case X86ISD::SHLD:               return "X86ISD::SHLD";
  case X86ISD::SHRD:               return "X86ISD::SHRD";
  case X86ISD::FAND:               return "X86ISD::FAND";
  case X86ISD::FANDN:              return "X86ISD::FANDN";
  case X86ISD::FOR:                return "X86ISD::FOR";
  case X86ISD::FXOR:               return "X86ISD::FXOR";
  case X86ISD::FSRL:               return "X86ISD::FSRL";
  case X86ISD::FILD:               return "X86ISD::FILD";
  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
  case X86ISD::FLD:                return "X86ISD::FLD";
  case X86ISD::FST:                return "X86ISD::FST";
  case X86ISD::CALL:               return "X86ISD::CALL";
  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
  case X86ISD::BT:                 return "X86ISD::BT";
  case X86ISD::CMP:                return "X86ISD::CMP";
  case X86ISD::COMI:               return "X86ISD::COMI";
  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
  case X86ISD::CMPM:               return "X86ISD::CMPM";
  case X86ISD::CMPMU:              return "X86ISD::CMPMU";
  case X86ISD::SETCC:              return "X86ISD::SETCC";
  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
  case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
  case X86ISD::FSETCCss:           return "X86ISD::FSETCCss";
  case X86ISD::CMOV:               return "X86ISD::CMOV";
  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
  case X86ISD::ANDNP:              return "X86ISD::ANDNP";
  case X86ISD::PSIGN:              return "X86ISD::PSIGN";
  case X86ISD::BLENDV:             return "X86ISD::BLENDV";
  case X86ISD::BLENDI:             return "X86ISD::BLENDI";
  case X86ISD::SUBUS:              return "X86ISD::SUBUS";
  case X86ISD::HADD:               return "X86ISD::HADD";
  case X86ISD::HSUB:               return "X86ISD::HSUB";
  case X86ISD::FHADD:              return "X86ISD::FHADD";
  case X86ISD::FHSUB:              return "X86ISD::FHSUB";
  case X86ISD::UMAX:               return "X86ISD::UMAX";
  case X86ISD::UMIN:               return "X86ISD::UMIN";
  case X86ISD::SMAX:               return "X86ISD::SMAX";
  case X86ISD::SMIN:               return "X86ISD::SMIN";
  case X86ISD::FMAX:               return "X86ISD::FMAX";
  case X86ISD::FMIN:               return "X86ISD::FMIN";
  case X86ISD::FMAXC:              return "X86ISD::FMAXC";
  case X86ISD::FMINC:              return "X86ISD::FMINC";
  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
  case X86ISD::FRCP:               return "X86ISD::FRCP";
  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
  case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
  case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
  case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
  case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
  case X86ISD::VSEXT_MOVL:         return "X86ISD::VSEXT_MOVL";
  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
  case X86ISD::VZEXT:              return "X86ISD::VZEXT";
  case X86ISD::VSEXT:              return "X86ISD::VSEXT";
  case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
  case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
  case X86ISD::VINSERT:            return "X86ISD::VINSERT";
  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
  case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
  case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
  case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
  case X86ISD::VSHL:               return "X86ISD::VSHL";
  case X86ISD::VSRL:               return "X86ISD::VSRL";
  case X86ISD::VSRA:               return "X86ISD::VSRA";
  case X86ISD::VSHLI:              return "X86ISD::VSHLI";
  case X86ISD::VSRLI:              return "X86ISD::VSRLI";
  case X86ISD::VSRAI:              return "X86ISD::VSRAI";
  case X86ISD::CMPP:               return "X86ISD::CMPP";
  case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
  case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
  case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
  case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
  case X86ISD::ADD:                return "X86ISD::ADD";
  case X86ISD::SUB:                return "X86ISD::SUB";
  case X86ISD::ADC:                return "X86ISD::ADC";
  case X86ISD::SBB:                return "X86ISD::SBB";
  case X86ISD::SMUL:               return "X86ISD::SMUL";
  case X86ISD::UMUL:               return "X86ISD::UMUL";
  case X86ISD::INC:                return "X86ISD::INC";
  case X86ISD::DEC:                return "X86ISD::DEC";
  case X86ISD::OR:                 return "X86ISD::OR";
  case X86ISD::XOR:                return "X86ISD::XOR";
  case X86ISD::AND:                return "X86ISD::AND";
  case X86ISD::BLSI:               return "X86ISD::BLSI";
  case X86ISD::BLSMSK:             return "X86ISD::BLSMSK";
  case X86ISD::BLSR:               return "X86ISD::BLSR";
  case X86ISD::BZHI:               return "X86ISD::BZHI";
  case X86ISD::BEXTR:              return "X86ISD::BEXTR";
  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
  case X86ISD::PTEST:              return "X86ISD::PTEST";
  case X86ISD::TESTP:              return "X86ISD::TESTP";
  case X86ISD::TESTM:              return "X86ISD::TESTM";
  case X86ISD::KORTEST:            return "X86ISD::KORTEST";
  case X86ISD::KTEST:              return "X86ISD::KTEST";
  case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
  case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
  case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
  case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
  case X86ISD::SHUFP:              return "X86ISD::SHUFP";
  case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
  case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
  case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
  case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
  case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
  case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
  case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
  case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
  case X86ISD::MOVSD:              return "X86ISD::MOVSD";
  case X86ISD::MOVSS:              return "X86ISD::MOVSS";
  case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
  case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
  case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
  case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
  case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
  case X86ISD::VPERMV:             return "X86ISD::VPERMV";
  case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
  case X86ISD::VPERMI:             return "X86ISD::VPERMI";
  case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
  case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
  case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
  case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
  case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
  case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
  case X86ISD::SAHF:               return "X86ISD::SAHF";
  case X86ISD::RDRAND:             return "X86ISD::RDRAND";
  case X86ISD::RDSEED:             return "X86ISD::RDSEED";
  case X86ISD::FMADD:              return "X86ISD::FMADD";
  case X86ISD::FMSUB:              return "X86ISD::FMSUB";
  case X86ISD::FNMADD:             return "X86ISD::FNMADD";
  case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
  case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
  case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
  case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
  case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
  case X86ISD::XTEST:              return "X86ISD::XTEST";
  }
}

// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                              Type *Ty) const {
  // X86 supports extremely general addressing modes.
  CodeModel::Model M = getTargetMachine().getCodeModel();
  Reloc::Model R = getTargetMachine().getRelocationModel();

  // X86 allows a sign-extended 32-bit immediate field as a displacement.
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
    return false;

  if (AM.BaseGV) {
    unsigned GVFlags =
      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());

    // If a reference to this global requires an extra load, we can't fold it.
    if (isGlobalStubReference(GVFlags))
      return false;

    // If BaseGV requires a register for the PIC base, we cannot also have a
    // BaseReg specified.
    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
      return false;

    // If lower 4G is not available, then we must use rip-relative addressing.
    if ((M != CodeModel::Small || R != Reloc::Static) &&
        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
      return false;
  }

  switch (AM.Scale) {
  case 0:
  case 1:
  case 2:
  case 4:
  case 8:
    // These scales always work.
    break;
  case 3:
  case 5:
  case 9:
    // These scales are formed with basereg+scalereg.  Only accept if there is
    // no basereg yet (base + 2/4/8 * index uses the base slot for the extra
    // addend).
    if (AM.HasBaseReg)
      return false;
    break;
  default:  // Other stuff never works.
    return false;
  }

  return true;
}

/// isTruncateFree - Integer truncation is free: it is just a re-read of the
/// narrower subregister, so any strictly-narrowing integer truncate is free.
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 > NumBits2;
}

/// allowTruncateForTailCall - Permit a tail call whose result is truncated
/// from Ty1 to Ty2, provided both are integers and Ty1 is a legal type.
bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;

  if (!isTypeLegal(EVT::getEVT(Ty1)))
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Assuming the caller doesn't have a zeroext or signext return parameter,
  // truncation all the way down to i1 is valid.
  return true;
}

/// isLegalICmpImmediate - CMP takes a sign-extended 32-bit immediate.
bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  return isInt<32>(Imm);
}

/// isLegalAddImmediate - ADD takes a sign-extended 32-bit immediate.
bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
  // Can also use sub to handle negated immediates.
  return isInt<32>(Imm);
}

// EVT overload of isTruncateFree: any strictly-narrowing integer truncate.
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 > NumBits2;
}

bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
}

bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
}

// SDValue overload: additionally treat zero-extension of a loaded value as
// free, since MOVZX loads are available for the narrow integer widths.
bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  EVT VT1 = Val.getValueType();
  if (isZExtFree(VT1, VT2))
    return true;

  if (Val.getOpcode() != ISD::LOAD)
    return false;

  if (!VT1.isSimple() || !VT1.isInteger() ||
      !VT2.isSimple() || !VT2.isInteger())
    return false;

  switch (VT1.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    // X86 has 8, 16, and 32-bit zero-extending loads.
    return true;
  }

  return false;
}

/// isFMAFasterThanFMulAndFAdd - A fused multiply-add is profitable only when
/// the subtarget actually has FMA or FMA4, and only for f32/f64 scalars (or
/// vectors thereof -- the scalar element type is what is checked).
bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
    return false;

  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}

bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  // i16 instructions are longer (0x66 prefix) and potentially slower.
  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                      EVT VT) const {
  if (!VT.isSimple())
    return false;

  MVT SVT = VT.getSimpleVT();

  // Very little shuffling can be done for 64-bit vectors right now.
  if (VT.getSizeInBits() == 64)
    return false;

  // FIXME: pshufb, blends, shifts.
  // Legal iff the mask matches one of the shuffle patterns the lowering code
  // knows how to select directly.
  return (SVT.getVectorNumElements() == 2 ||
          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
          isMOVLMask(M, SVT) ||
          isSHUFPMask(M, SVT) ||
          isPSHUFDMask(M, SVT) ||
          isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
          isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
          isPALIGNRMask(M, SVT, Subtarget) ||
          isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
          isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
          isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
          isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()));
}

bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                          EVT VT) const {
  if (!VT.isSimple())
    return false;

  MVT SVT = VT.getSimpleVT();
  unsigned NumElts = SVT.getVectorNumElements();
  // FIXME: This collection of masks seems suspect.
  if (NumElts == 2)
    return true;
  if (NumElts == 4 && SVT.is128BitVector()) {
    return (isMOVLMask(Mask, SVT)  ||
            isCommutedMOVLMask(Mask, SVT, true) ||
            isSHUFPMask(Mask, SVT) ||
            isSHUFPMask(Mask, SVT, /* Commuted */ true));
  }
  return false;
}

//===----------------------------------------------------------------------===//
//                           X86 Scheduler Hooks
//===----------------------------------------------------------------------===//

/// Utility function to emit xbegin specifying the start of an RTM region.
/// Splits MBB at MI into thisMBB / mainMBB / sinkMBB; XBEGIN names sinkMBB as
/// the abort target (on abort the result register receives whatever EAX
/// holds -- the abort status, per the RTM spec; confirm against the ISA
/// manual), while the fallthrough path loads -1 into EAX first.
static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
                                     const TargetInstrInfo *TII) {
  DebugLoc DL = MI->getDebugLoc();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = MBB;
  ++I;

  // For the v = xbegin(), we generate
  //
  // thisMBB:
  //  xbegin sinkMBB
  //
  // mainMBB:
  //  eax = -1
  //
  // sinkMBB:
  //  v = eax

  MachineBasicBlock *thisMBB = MBB;
  MachineFunction *MF = MBB->getParent();
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // thisMBB:
  //  xbegin sinkMBB
  //  # fallthrough to mainMBB
  //  # abortion to sinkMBB
  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
  thisMBB->addSuccessor(mainMBB);
  thisMBB->addSuccessor(sinkMBB);

  // mainMBB:
  //  EAX = -1
  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  // EAX is live into the sinkMBB
  sinkMBB->addLiveIn(X86::EAX);
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
    .addReg(X86::EAX);

  MI->eraseFromParent();
  return sinkMBB;
}

// Get CMPXCHG opcode for the specified data type.
static unsigned getCmpXChgOpcode(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i8:  return X86::LCMPXCHG8;
  case MVT::i16: return X86::LCMPXCHG16;
  case MVT::i32: return X86::LCMPXCHG32;
  case MVT::i64: return X86::LCMPXCHG64;
  default:
    break;
  }
  llvm_unreachable("Invalid operand size!");
}

// Get LOAD opcode for the specified data type.
static unsigned getLoadOpcode(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i8:  return X86::MOV8rm;
  case MVT::i16: return X86::MOV16rm;
  case MVT::i32: return X86::MOV32rm;
  case MVT::i64: return X86::MOV64rm;
  default:
    break;
  }
  llvm_unreachable("Invalid operand size!");
}

// Get opcode of the non-atomic one from the specified atomic instruction.
static unsigned getNonAtomicOpcode(unsigned Opc) {
  switch (Opc) {
  case X86::ATOMAND8:  return X86::AND8rr;
  case X86::ATOMAND16: return X86::AND16rr;
  case X86::ATOMAND32: return X86::AND32rr;
  case X86::ATOMAND64: return X86::AND64rr;
  case X86::ATOMOR8:   return X86::OR8rr;
  case X86::ATOMOR16:  return X86::OR16rr;
  case X86::ATOMOR32:  return X86::OR32rr;
  case X86::ATOMOR64:  return X86::OR64rr;
  case X86::ATOMXOR8:  return X86::XOR8rr;
  case X86::ATOMXOR16: return X86::XOR16rr;
  case X86::ATOMXOR32: return X86::XOR32rr;
  case X86::ATOMXOR64: return X86::XOR64rr;
  }
  llvm_unreachable("Unhandled atomic-load-op opcode!");
}

// Get opcode of the non-atomic one from the specified atomic instruction with
// extra opcode.
//
// NAND maps to AND plus a NOT (ExtraOpc) on the result; MIN/MAX map to a CMP
// (ExtraOpc) plus a CMOVcc whose condition keeps the accumulator when it
// already wins the comparison.  Note the 8-bit MIN/MAX cases deliberately
// return 32-bit CMOV opcodes -- there is no 8-bit cmov, so the caller
// (EmitAtomicLoadArith) promotes the operands to 32 bits first.
static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
                                               unsigned &ExtraOpc) {
  switch (Opc) {
  case X86::ATOMNAND8:  ExtraOpc = X86::NOT8r;   return X86::AND8rr;
  case X86::ATOMNAND16: ExtraOpc = X86::NOT16r;  return X86::AND16rr;
  case X86::ATOMNAND32: ExtraOpc = X86::NOT32r;  return X86::AND32rr;
  case X86::ATOMNAND64: ExtraOpc = X86::NOT64r;  return X86::AND64rr;
  case X86::ATOMMAX8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVL32rr;
  case X86::ATOMMAX16:  ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr;
  case X86::ATOMMAX32:  ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr;
  case X86::ATOMMAX64:  ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr;
  case X86::ATOMMIN8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVG32rr;
  case X86::ATOMMIN16:  ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr;
  case X86::ATOMMIN32:  ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr;
  case X86::ATOMMIN64:  ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr;
  case X86::ATOMUMAX8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVB32rr;
  case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr;
  case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr;
  case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr;
  case X86::ATOMUMIN8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVA32rr;
  case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr;
  case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr;
  case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr;
  }
  llvm_unreachable("Unhandled atomic-load-op opcode!");
}

// Get opcode of the non-atomic one from the specified atomic instruction for
// 64-bit data type on 32-bit target.
//
// The operation runs on (lo, hi) i32 halves: HiOpc handles the high half.
// ADD/SUB use ADC/SBB for the high half to propagate the carry/borrow;
// MIN/MAX return SETcc opcodes in both slots (the comparison result is
// materialized rather than a direct cmov -- presumably consumed by the
// 6432 pseudo expansion; confirm against EmitAtomicLoadArith6432).
static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
  switch (Opc) {
  case X86::ATOMAND6432:  HiOpc = X86::AND32rr; return X86::AND32rr;
  case X86::ATOMOR6432:   HiOpc = X86::OR32rr;  return X86::OR32rr;
  case X86::ATOMXOR6432:  HiOpc = X86::XOR32rr; return X86::XOR32rr;
  case X86::ATOMADD6432:  HiOpc = X86::ADC32rr; return X86::ADD32rr;
  case X86::ATOMSUB6432:  HiOpc = X86::SBB32rr; return X86::SUB32rr;
  case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr;
  case X86::ATOMMAX6432:  HiOpc = X86::SETLr;   return X86::SETLr;
  case X86::ATOMMIN6432:  HiOpc = X86::SETGr;   return X86::SETGr;
  case X86::ATOMUMAX6432: HiOpc = X86::SETBr;   return X86::SETBr;
  case X86::ATOMUMIN6432: HiOpc = X86::SETAr;   return X86::SETAr;
  }
  llvm_unreachable("Unhandled atomic-load-op opcode!");
}

// Get opcode of the non-atomic one from the specified atomic instruction for
// 64-bit data type on 32-bit target with extra opcode.
// NAND is the only 64-on-32 operation needing an extra opcode: AND on both
// halves followed by NOT (ExtraOpc) on each result.
static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
                                                   unsigned &HiOpc,
                                                   unsigned &ExtraOpc) {
  switch (Opc) {
  case X86::ATOMNAND6432:
    ExtraOpc = X86::NOT32r;
    HiOpc = X86::AND32rr;
    return X86::AND32rr;
  }
  llvm_unreachable("Unhandled atomic-load-op opcode!");
}

// Get pseudo CMOV opcode from the specified data type.  Used when the
// subtarget has no native cmov and the select must be lowered to branches.
static unsigned getPseudoCMOVOpc(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i8:  return X86::CMOV_GR8;
  case MVT::i16: return X86::CMOV_GR16;
  case MVT::i32: return X86::CMOV_GR32;
  default:
    break;
  }
  llvm_unreachable("Unknown CMOV opcode!");
}

// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions.
// They will be translated into a spin-loop or compare-exchange loop from
//
//    ...
//    dst = atomic-fetch-op MI.addr, MI.val
//    ...
//
// to
//
//    ...
//    t1 = LOAD MI.addr
// loop:
//    t4 = phi(t1, t3 / loop)
//    t2 = OP MI.val, t4
//    EAX = t4
//    LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
//    t3 = EAX
//    JNE loop
// sink:
//    dst = t3
//    ...
MachineBasicBlock *
X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
                                       MachineBasicBlock *MBB) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = MBB;
  ++I;

  assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
         "Unexpected number of operands");

  assert(MI->hasOneMemOperand() &&
         "Expected atomic-load-op to have one memoperand");

  // Memory Reference
  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();

  unsigned DstReg, SrcReg;
  unsigned MemOpndSlot;

  // Pseudo operand layout: dst, <address operands>, src.
  unsigned CurOp = 0;

  DstReg = MI->getOperand(CurOp++).getReg();
  MemOpndSlot = CurOp;
  CurOp += X86::AddrNumOperands;
  SrcReg = MI->getOperand(CurOp++).getReg();

  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  MVT::SimpleValueType VT = *RC->vt_begin();
  unsigned t1 = MRI.createVirtualRegister(RC); // Initial load of [addr].
  unsigned t2 = MRI.createVirtualRegister(RC); // OP(src, accumulator).
  unsigned t3 = MRI.createVirtualRegister(RC); // Value cmpxchg found in memory.
  unsigned t4 = MRI.createVirtualRegister(RC); // Loop-carried accumulator (PHI).
  // EAX (or its sub/super register for this width) -- the implicit
  // compare operand of LCMPXCHG.
  unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT);

  unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
  unsigned LOADOpc = getLoadOpcode(VT);

  // For the atomic load-arith operator, we generate
  //
  // thisMBB:
  //   t1 = LOAD [MI.addr]
  // mainMBB:
  //   t4 = phi(t1 / thisMBB, t3 / mainMBB)
  //   t2 = OP MI.val, t4
  //   EAX = t4
  //   LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
  //   t3 = EAX
  //   JNE mainMBB
  // sinkMBB:
  //   dst = t3

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);

  MachineInstrBuilder MIB;

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // thisMBB:
  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
    // The address registers are reused by the loop body below, so they must
    // not be killed here.
    if (NewMO.isReg())
      NewMO.setIsKill(false);
    MIB.addOperand(NewMO);
  }
  // Re-derive a load-only memory operand from the pseudo's RMW memoperand.
  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
    unsigned flags = (*MMOI)->getFlags();
    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
    MachineMemOperand *MMO =
      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
                               (*MMOI)->getSize(),
                               (*MMOI)->getBaseAlignment(),
                               (*MMOI)->getTBAAInfo(),
                               (*MMOI)->getRanges());
    MIB.addMemOperand(MMO);
  }

  thisMBB->addSuccessor(mainMBB);

  // mainMBB:
  MachineBasicBlock *origMainMBB = mainMBB;

  // Add a PHI.
  MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4)
    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);

  unsigned Opc = MI->getOpcode();
  switch (Opc) {
  default:
    llvm_unreachable("Unhandled atomic-load-op opcode!");
  case X86::ATOMAND8:
  case X86::ATOMAND16:
  case X86::ATOMAND32:
  case X86::ATOMAND64:
  case X86::ATOMOR8:
  case X86::ATOMOR16:
  case X86::ATOMOR32:
  case X86::ATOMOR64:
  case X86::ATOMXOR8:
  case X86::ATOMXOR16:
  case X86::ATOMXOR32:
  case X86::ATOMXOR64: {
    // Plain bitwise ops: t2 = OP src, t4.
    unsigned ARITHOpc = getNonAtomicOpcode(Opc);
    BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg)
      .addReg(t4);
    break;
  }
  case X86::ATOMNAND8:
  case X86::ATOMNAND16:
  case X86::ATOMNAND32:
  case X86::ATOMNAND64: {
    // NAND: t2 = NOT (src AND t4).
    unsigned Tmp = MRI.createVirtualRegister(RC);
    unsigned NOTOpc;
    unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
    BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg)
      .addReg(t4);
    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp);
    break;
  }
  case X86::ATOMMAX8:
  case X86::ATOMMAX16:
  case X86::ATOMMAX32:
  case X86::ATOMMAX64:
  case X86::ATOMMIN8:
  case X86::ATOMMIN16:
  case X86::ATOMMIN32:
  case X86::ATOMMIN64:
  case X86::ATOMUMAX8:
  case X86::ATOMUMAX16:
  case X86::ATOMUMAX32:
  case X86::ATOMUMAX64:
  case X86::ATOMUMIN8:
  case X86::ATOMUMIN16:
  case X86::ATOMUMIN32:
  case X86::ATOMUMIN64: {
    // MIN/MAX: t2 = select(CMP src, t4) via CMOV (or a pseudo select when
    // the subtarget lacks cmov).
    unsigned CMPOpc;
    unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);

    BuildMI(mainMBB, DL, TII->get(CMPOpc))
      .addReg(SrcReg)
      .addReg(t4);

    if (Subtarget->hasCMov()) {
      if (VT != MVT::i8) {
        // Native support
        BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
          .addReg(SrcReg)
          .addReg(t4);
      } else {
        // Promote i8 to i32 to use CMOV32 (there is no 8-bit cmov): insert
        // both values into fresh 32-bit registers, cmov, then extract the
        // low byte of the result.
        const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
        const TargetRegisterClass *RC32 =
          TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit);
        unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
        unsigned AccReg32 = MRI.createVirtualRegister(RC32);
        unsigned Tmp = MRI.createVirtualRegister(RC32);

        unsigned Undef = MRI.createVirtualRegister(RC32);
        BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);

        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32)
          .addReg(Undef)
          .addReg(SrcReg)
          .addImm(X86::sub_8bit);
        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
          .addReg(Undef)
          .addReg(t4)
          .addImm(X86::sub_8bit);

        BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp)
          .addReg(SrcReg32)
          .addReg(AccReg32);

        BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2)
          .addReg(Tmp, 0, X86::sub_8bit);
      }
    } else {
      // Use pseudo select and lower them.
      assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
             "Invalid atomic-load-op transformation!");
      unsigned SelOpc = getPseudoCMOVOpc(VT);
      X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
      assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
      MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2)
              .addReg(SrcReg).addReg(t4)
              .addImm(CC);
      mainMBB = EmitLoweredSelect(MIB, mainMBB);
      // Replace the original PHI node as mainMBB is changed after CMOV
      // lowering.
      BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4)
        .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
      Phi->eraseFromParent();
    }
    break;
  }
  }

  // Copy PhyReg back from virtual register.  EAX must hold the value the
  // memory is expected to contain (the accumulator t4) for LCMPXCHG.
  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg)
    .addReg(t4);

  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
    if (NewMO.isReg())
      NewMO.setIsKill(false);
    MIB.addOperand(NewMO);
  }
  MIB.addReg(t2);
  MIB.setMemRefs(MMOBegin, MMOEnd);

  // Copy PhyReg back to virtual register.  After LCMPXCHG, EAX holds the
  // value actually found in memory; it feeds the next loop iteration.
  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3)
    .addReg(PhyReg);

  // Retry if the compare-exchange did not succeed (ZF clear).
  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);

  mainMBB->addSuccessor(origMainMBB);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(TargetOpcode::COPY), DstReg)
    .addReg(t3);

  MI->eraseFromParent();
  return sinkMBB;
}

// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic
// instructions. They will be translated into a spin-loop or compare-exchange
// loop from
//
//    ...
//    dst = atomic-fetch-op MI.addr, MI.val
//    ...
//
// to
//
//    ...
//    t1L = LOAD [MI.addr + 0]
//    t1H = LOAD [MI.addr + 4]
// loop:
//    t4L = phi(t1L, t3L / loop)
//    t4H = phi(t1H, t3H / loop)
//    t2L = OP MI.val.lo, t4L
//    t2H = OP MI.val.hi, t4H
//    EAX = t4L
//    EDX = t4H
//    EBX = t2L
//    ECX = t2H
//    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
//    t3L = EAX
//    t3H = EDX
//    JNE loop
// sink:
//    dstL = t3L
//    dstH = t3H
//    ...
14542MachineBasicBlock * 14543X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, 14544 MachineBasicBlock *MBB) const { 14545 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 14546 DebugLoc DL = MI->getDebugLoc(); 14547 14548 MachineFunction *MF = MBB->getParent(); 14549 MachineRegisterInfo &MRI = MF->getRegInfo(); 14550 14551 const BasicBlock *BB = MBB->getBasicBlock(); 14552 MachineFunction::iterator I = MBB; 14553 ++I; 14554 14555 assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 && 14556 "Unexpected number of operands"); 14557 14558 assert(MI->hasOneMemOperand() && 14559 "Expected atomic-load-op32 to have one memoperand"); 14560 14561 // Memory Reference 14562 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 14563 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 14564 14565 unsigned DstLoReg, DstHiReg; 14566 unsigned SrcLoReg, SrcHiReg; 14567 unsigned MemOpndSlot; 14568 14569 unsigned CurOp = 0; 14570 14571 DstLoReg = MI->getOperand(CurOp++).getReg(); 14572 DstHiReg = MI->getOperand(CurOp++).getReg(); 14573 MemOpndSlot = CurOp; 14574 CurOp += X86::AddrNumOperands; 14575 SrcLoReg = MI->getOperand(CurOp++).getReg(); 14576 SrcHiReg = MI->getOperand(CurOp++).getReg(); 14577 14578 const TargetRegisterClass *RC = &X86::GR32RegClass; 14579 const TargetRegisterClass *RC8 = &X86::GR8RegClass; 14580 14581 unsigned t1L = MRI.createVirtualRegister(RC); 14582 unsigned t1H = MRI.createVirtualRegister(RC); 14583 unsigned t2L = MRI.createVirtualRegister(RC); 14584 unsigned t2H = MRI.createVirtualRegister(RC); 14585 unsigned t3L = MRI.createVirtualRegister(RC); 14586 unsigned t3H = MRI.createVirtualRegister(RC); 14587 unsigned t4L = MRI.createVirtualRegister(RC); 14588 unsigned t4H = MRI.createVirtualRegister(RC); 14589 14590 unsigned LCMPXCHGOpc = X86::LCMPXCHG8B; 14591 unsigned LOADOpc = X86::MOV32rm; 14592 14593 // For the atomic load-arith operator, we generate 14594 // 14595 // thisMBB: 14596 // t1L = LOAD [MI.addr + 0] 
14597 // t1H = LOAD [MI.addr + 4] 14598 // mainMBB: 14599 // t4L = phi(t1L / thisMBB, t3L / mainMBB) 14600 // t4H = phi(t1H / thisMBB, t3H / mainMBB) 14601 // t2L = OP MI.val.lo, t4L 14602 // t2H = OP MI.val.hi, t4H 14603 // EBX = t2L 14604 // ECX = t2H 14605 // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] 14606 // t3L = EAX 14607 // t3H = EDX 14608 // JNE loop 14609 // sinkMBB: 14610 // dstL = t3L 14611 // dstH = t3H 14612 14613 MachineBasicBlock *thisMBB = MBB; 14614 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 14615 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 14616 MF->insert(I, mainMBB); 14617 MF->insert(I, sinkMBB); 14618 14619 MachineInstrBuilder MIB; 14620 14621 // Transfer the remainder of BB and its successor edges to sinkMBB. 14622 sinkMBB->splice(sinkMBB->begin(), MBB, 14623 llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); 14624 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 14625 14626 // thisMBB: 14627 // Lo 14628 MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L); 14629 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 14630 MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); 14631 if (NewMO.isReg()) 14632 NewMO.setIsKill(false); 14633 MIB.addOperand(NewMO); 14634 } 14635 for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) { 14636 unsigned flags = (*MMOI)->getFlags(); 14637 flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad; 14638 MachineMemOperand *MMO = 14639 MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags, 14640 (*MMOI)->getSize(), 14641 (*MMOI)->getBaseAlignment(), 14642 (*MMOI)->getTBAAInfo(), 14643 (*MMOI)->getRanges()); 14644 MIB.addMemOperand(MMO); 14645 }; 14646 MachineInstr *LowMI = MIB; 14647 14648 // Hi 14649 MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H); 14650 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 14651 if (i == X86::AddrDisp) { 14652 
MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32) 14653 } else { 14654 MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); 14655 if (NewMO.isReg()) 14656 NewMO.setIsKill(false); 14657 MIB.addOperand(NewMO); 14658 } 14659 } 14660 MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end()); 14661 14662 thisMBB->addSuccessor(mainMBB); 14663 14664 // mainMBB: 14665 MachineBasicBlock *origMainMBB = mainMBB; 14666 14667 // Add PHIs. 14668 MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L) 14669 .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB); 14670 MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H) 14671 .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB); 14672 14673 unsigned Opc = MI->getOpcode(); 14674 switch (Opc) { 14675 default: 14676 llvm_unreachable("Unhandled atomic-load-op6432 opcode!"); 14677 case X86::ATOMAND6432: 14678 case X86::ATOMOR6432: 14679 case X86::ATOMXOR6432: 14680 case X86::ATOMADD6432: 14681 case X86::ATOMSUB6432: { 14682 unsigned HiOpc; 14683 unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); 14684 BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L) 14685 .addReg(SrcLoReg); 14686 BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H) 14687 .addReg(SrcHiReg); 14688 break; 14689 } 14690 case X86::ATOMNAND6432: { 14691 unsigned HiOpc, NOTOpc; 14692 unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc); 14693 unsigned TmpL = MRI.createVirtualRegister(RC); 14694 unsigned TmpH = MRI.createVirtualRegister(RC); 14695 BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg) 14696 .addReg(t4L); 14697 BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg) 14698 .addReg(t4H); 14699 BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL); 14700 BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH); 14701 break; 14702 } 14703 case X86::ATOMMAX6432: 14704 case X86::ATOMMIN6432: 14705 case X86::ATOMUMAX6432: 14706 case X86::ATOMUMIN6432: { 14707 
unsigned HiOpc; 14708 unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); 14709 unsigned cL = MRI.createVirtualRegister(RC8); 14710 unsigned cH = MRI.createVirtualRegister(RC8); 14711 unsigned cL32 = MRI.createVirtualRegister(RC); 14712 unsigned cH32 = MRI.createVirtualRegister(RC); 14713 unsigned cc = MRI.createVirtualRegister(RC); 14714 // cl := cmp src_lo, lo 14715 BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) 14716 .addReg(SrcLoReg).addReg(t4L); 14717 BuildMI(mainMBB, DL, TII->get(LoOpc), cL); 14718 BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL); 14719 // ch := cmp src_hi, hi 14720 BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) 14721 .addReg(SrcHiReg).addReg(t4H); 14722 BuildMI(mainMBB, DL, TII->get(HiOpc), cH); 14723 BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH); 14724 // cc := if (src_hi == hi) ? cl : ch; 14725 if (Subtarget->hasCMov()) { 14726 BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc) 14727 .addReg(cH32).addReg(cL32); 14728 } else { 14729 MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc) 14730 .addReg(cH32).addReg(cL32) 14731 .addImm(X86::COND_E); 14732 mainMBB = EmitLoweredSelect(MIB, mainMBB); 14733 } 14734 BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc); 14735 if (Subtarget->hasCMov()) { 14736 BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L) 14737 .addReg(SrcLoReg).addReg(t4L); 14738 BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H) 14739 .addReg(SrcHiReg).addReg(t4H); 14740 } else { 14741 MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L) 14742 .addReg(SrcLoReg).addReg(t4L) 14743 .addImm(X86::COND_NE); 14744 mainMBB = EmitLoweredSelect(MIB, mainMBB); 14745 // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the 14746 // 2nd CMOV lowering. 
14747 mainMBB->addLiveIn(X86::EFLAGS); 14748 MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H) 14749 .addReg(SrcHiReg).addReg(t4H) 14750 .addImm(X86::COND_NE); 14751 mainMBB = EmitLoweredSelect(MIB, mainMBB); 14752 // Replace the original PHI node as mainMBB is changed after CMOV 14753 // lowering. 14754 BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L) 14755 .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB); 14756 BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H) 14757 .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB); 14758 PhiL->eraseFromParent(); 14759 PhiH->eraseFromParent(); 14760 } 14761 break; 14762 } 14763 case X86::ATOMSWAP6432: { 14764 unsigned HiOpc; 14765 unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); 14766 BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg); 14767 BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg); 14768 break; 14769 } 14770 } 14771 14772 // Copy EDX:EAX back from HiReg:LoReg 14773 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L); 14774 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H); 14775 // Copy ECX:EBX from t1H:t1L 14776 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L); 14777 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H); 14778 14779 MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); 14780 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 14781 MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); 14782 if (NewMO.isReg()) 14783 NewMO.setIsKill(false); 14784 MIB.addOperand(NewMO); 14785 } 14786 MIB.setMemRefs(MMOBegin, MMOEnd); 14787 14788 // Copy EDX:EAX back to t3H:t3L 14789 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX); 14790 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX); 14791 14792 BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); 14793 14794 mainMBB->addSuccessor(origMainMBB); 14795 
mainMBB->addSuccessor(sinkMBB); 14796 14797 // sinkMBB: 14798 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 14799 TII->get(TargetOpcode::COPY), DstLoReg) 14800 .addReg(t3L); 14801 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 14802 TII->get(TargetOpcode::COPY), DstHiReg) 14803 .addReg(t3H); 14804 14805 MI->eraseFromParent(); 14806 return sinkMBB; 14807} 14808 14809// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 14810// or XMM0_V32I8 in AVX all of this code can be replaced with that 14811// in the .td file. 14812static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB, 14813 const TargetInstrInfo *TII) { 14814 unsigned Opc; 14815 switch (MI->getOpcode()) { 14816 default: llvm_unreachable("illegal opcode!"); 14817 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; 14818 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; 14819 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; 14820 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; 14821 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; 14822 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; 14823 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; 14824 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; 14825 } 14826 14827 DebugLoc dl = MI->getDebugLoc(); 14828 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 14829 14830 unsigned NumArgs = MI->getNumOperands(); 14831 for (unsigned i = 1; i < NumArgs; ++i) { 14832 MachineOperand &Op = MI->getOperand(i); 14833 if (!(Op.isReg() && Op.isImplicit())) 14834 MIB.addOperand(Op); 14835 } 14836 if (MI->hasOneMemOperand()) 14837 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 14838 14839 BuildMI(*BB, MI, dl, 14840 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) 14841 .addReg(X86::XMM0); 14842 14843 MI->eraseFromParent(); 14844 return BB; 14845} 14846 14847// FIXME: Custom handling because TableGen doesn't support 
// defs in an instruction pattern
// Lower a PCMPxSTRI pseudo to the real SSE4.2 instruction and copy its ECX
// result into the pseudo's destination vreg.
static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
                                       const TargetInstrInfo *TII) {
  unsigned Opc;
  switch (MI->getOpcode()) {
  default: llvm_unreachable("illegal opcode!");
  case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
  case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
  case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
  case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
  }

  DebugLoc dl = MI->getDebugLoc();
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));

  // Forward explicit operands only; operand 0 is the result, and implicit
  // register operands come from the real instruction's definition.
  unsigned NumArgs = MI->getNumOperands(); // remove the results
  for (unsigned i = 1; i < NumArgs; ++i) {
    MachineOperand &Op = MI->getOperand(i);
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.addOperand(Op);
  }
  if (MI->hasOneMemOperand())
    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());

  // The index result is produced in ECX; copy it to the dest vreg.
  BuildMI(*BB, MI, dl,
          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
    .addReg(X86::ECX);

  MI->eraseFromParent();
  return BB;
}

// Lower the MONITOR pseudo: materialize the address into RAX/EAX via LEA and
// move the two remaining operands into ECX/EDX, then emit MONITORrrr (which,
// per the comment below, takes no explicit operands).
static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
                                       const TargetInstrInfo *TII,
                                       const X86Subtarget* Subtarget) {
  DebugLoc dl = MI->getDebugLoc();

  // Address into RAX/EAX, other two args into ECX, EDX.
  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.addOperand(MI->getOperand(i));

  unsigned ValOps = X86::AddrNumOperands;
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
    .addReg(MI->getOperand(ValOps).getReg());
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
    .addReg(MI->getOperand(ValOps+1).getReg());

  // The instruction doesn't actually take any operands though.
  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));

  MI->eraseFromParent(); // The pseudo is gone now.
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(
                   MachineInstr *MI,
                   MachineBasicBlock *MBB) const {
  // Emit va_arg instruction on X86-64.

  // Operands to this pseudo-instruction:
  //   0  ) Output        : destination address (reg)
  //   1-5) Input         : va_list address (addr, i64mem)
  //   6  ) ArgSize       : Size (in bytes) of vararg type
  //   7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
  //   8  ) Align         : Alignment of type
  //   9  ) EFLAGS (implicit-def)

  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");

  unsigned DestReg = MI->getOperand(0).getReg();
  MachineOperand &Base = MI->getOperand(1);
  MachineOperand &Scale = MI->getOperand(2);
  MachineOperand &Index = MI->getOperand(3);
  MachineOperand &Disp = MI->getOperand(4);
  MachineOperand &Segment = MI->getOperand(5);
  unsigned ArgSize = MI->getOperand(6).getImm();
  unsigned ArgMode = MI->getOperand(7).getImm();
  unsigned Align = MI->getOperand(8).getImm();

  // Memory Reference
  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();

  // Machine Information
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
  DebugLoc DL = MI->getDebugLoc();

  // Layout of the x86-64 va_list this code indexes into (SysV ABI):
  // struct va_list {
  //   i32   gp_offset
  //   i32   fp_offset
  //   i64   overflow_area (address)
  //   i64   reg_save_area (address)
  // }
  // sizeof(va_list) = 24
  // alignment(va_list) = 8

  unsigned TotalNumIntRegs = 6;
  unsigned TotalNumXMMRegs = 8;
  bool UseGPOffset = (ArgMode == 1);
  bool UseFPOffset = (ArgMode == 2);
  unsigned MaxOffset = TotalNumIntRegs * 8 +
                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);

  /* Align ArgSize to a multiple of 8 */
  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
  bool NeedsAlign = (Align > 8);

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *overflowMBB;
  MachineBasicBlock *offsetMBB;
  MachineBasicBlock *endMBB;

  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
  unsigned OffsetReg = 0;

  if (!UseGPOffset && !UseFPOffset) {
    // If we only pull from the overflow region, we don't create a branch.
    // We don't need to alter control flow.
    OffsetDestReg = 0; // unused
    OverflowDestReg = DestReg;

    offsetMBB = NULL;
    overflowMBB = thisMBB;
    endMBB = thisMBB;
  } else {
    // First emit code to check if gp_offset (or fp_offset) is below the bound.
    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
    // If not, pull from overflow_area. (branch to overflowMBB)
    //
    // thisMBB
    //   |     .
    //   |        .
    //   offsetMBB   overflowMBB
    //   |        .
    //   |     .
    //        endMBB

    // Registers for the PHI in endMBB
    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    MachineFunction *MF = MBB->getParent();
    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

    MachineFunction::iterator MBBIter = MBB;
    ++MBBIter;

    // Insert the new basic blocks
    MF->insert(MBBIter, offsetMBB);
    MF->insert(MBBIter, overflowMBB);
    MF->insert(MBBIter, endMBB);

    // Transfer the remainder of MBB and its successor edges to endMBB.
    endMBB->splice(endMBB->begin(), thisMBB,
                   llvm::next(MachineBasicBlock::iterator(MI)),
                   thisMBB->end());
    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

    // Make offsetMBB and overflowMBB successors of thisMBB
    thisMBB->addSuccessor(offsetMBB);
    thisMBB->addSuccessor(overflowMBB);

    // endMBB is a successor of both offsetMBB and overflowMBB
    offsetMBB->addSuccessor(endMBB);
    overflowMBB->addSuccessor(endMBB);

    // Load the offset value into a register; fp_offset lives 4 bytes past
    // gp_offset in the va_list.
    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, UseFPOffset ? 4 : 0)
      .addOperand(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

    // Check if there is enough room left to pull this argument.
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
      .addReg(OffsetReg)
      .addImm(MaxOffset + 8 - ArgSizeA8);

    // Branch to "overflowMBB" if offset >= max
    // Fall through to "offsetMBB" otherwise
    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
      .addMBB(overflowMBB);
  }

  // In offsetMBB, emit code to use the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg != 0);

    // Read the reg_save_area address (offset 16 in the va_list).
    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, 16)
      .addOperand(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

    // Zero-extend the offset
    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
      .addImm(0)
      .addReg(OffsetReg)
      .addImm(X86::sub_32bit);

    // Add the offset to the reg_save_area to get the final address.
    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
      .addReg(OffsetReg64)
      .addReg(RegSaveReg);

    // Compute the offset for the next argument
    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
      .addReg(OffsetReg)
      .addImm(UseFPOffset ? 16 : 8);

    // Store it back into the va_list.
    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, UseFPOffset ? 4 : 0)
      .addOperand(Segment)
      .addReg(NextOffsetReg)
      .setMemRefs(MMOBegin, MMOEnd);

    // Jump to endMBB
    BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
      .addMBB(endMBB);
  }

  //
  // Emit code to use overflow area
  //

  // Load the overflow_area address into a register (offset 8 in the va_list).
  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
    .addOperand(Base)
    .addOperand(Scale)
    .addOperand(Index)
    .addDisp(Disp, 8)
    .addOperand(Segment)
    .setMemRefs(MMOBegin, MMOEnd);

  // If we need to align it, do so. Otherwise, just copy the address
  // to OverflowDestReg.
  if (NeedsAlign) {
    // Align the overflow address
    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);

    // aligned_addr = (addr + (align-1)) & ~(align-1)
    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
      .addReg(OverflowAddrReg)
      .addImm(Align-1);

    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
      .addReg(TmpReg)
      .addImm(~(uint64_t)(Align-1));
  } else {
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
      .addReg(OverflowAddrReg);
  }

  // Compute the next overflow address after this argument.
  // (the overflow address should be kept 8-byte aligned)
  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
    .addReg(OverflowDestReg)
    .addImm(ArgSizeA8);

  // Store the new overflow address.
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
    .addOperand(Base)
    .addOperand(Scale)
    .addOperand(Index)
    .addDisp(Disp, 8)
    .addOperand(Segment)
    .addReg(NextAddrReg)
    .setMemRefs(MMOBegin, MMOEnd);

  // If we branched, emit the PHI to the front of endMBB.
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), DL,
            TII->get(X86::PHI), DestReg)
      .addReg(OffsetDestReg).addMBB(offsetMBB)
      .addReg(OverflowDestReg).addMBB(overflowMBB);
  }

  // Erase the pseudo instruction
  MI->eraseFromParent();

  return endMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
                                                 MachineInstr *MI,
                                                 MachineBasicBlock *MBB) const {
  // Emit code to save XMM registers to the stack. The ABI says that the
  // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them,
  // however this code takes a simpler approach and just executes all
  // of the stores if %al is non-zero. It's less code, and it's probably
  // easier on the hardware branch predictor, and stores aren't all that
  // expensive anyway.

  // Create the new basic blocks. One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Transfer the remainder of MBB and its successor edges to EndMBB.
  EndMBB->splice(EndMBB->begin(), MBB,
                 llvm::next(MachineBasicBlock::iterator(MI)),
                 MBB->end());
  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  // Pseudo operands: %al count, the frame index of the register save area,
  // and the FP-register offset within it; the remaining operands are the
  // XMM registers to save.
  unsigned CountReg = MI->getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();

  if (!Subtarget->isTargetWin64()) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }

  unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
  // In the XMM save block, save all the XMM argument registers.
  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
    MachineMemOperand *MMO =
      F->getMachineMemOperand(
          MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
          MachineMemOperand::MOStore,
          /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
      .addFrameIndex(RegSaveFrameIndex)
      .addImm(/*Scale=*/1)
      .addReg(/*IndexReg=*/0)
      .addImm(/*Disp=*/Offset)
      .addReg(/*Segment=*/0)
      .addReg(MI->getOperand(i).getReg())
      .addMemOperand(MMO);
  }

  MI->eraseFromParent();   // The pseudo instruction is gone now.

  return EndMBB;
}

// The EFLAGS operand of SelectItr might be missing a kill marker
// because there were multiple uses of EFLAGS, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
                                     MachineBasicBlock* BB,
                                     const TargetRegisterInfo* TRI) {
  // Scan forward through BB for a use/def of EFLAGS.
  MachineBasicBlock::iterator miI(llvm::next(SelectItr));
  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
    const MachineInstr& mi = *miI;
    // A later read means EFLAGS is still live: no kill flag on SelectItr.
    if (mi.readsRegister(X86::EFLAGS))
      return false;
    if (mi.definesRegister(X86::EFLAGS))
      break; // Should have kill-flag - update below.
  }

  // If we hit the end of the block, check whether EFLAGS is live into a
  // successor.
  if (miI == BB->end()) {
    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
                                          sEnd = BB->succ_end();
         sItr != sEnd; ++sItr) {
      MachineBasicBlock* succ = *sItr;
      if (succ->isLiveIn(X86::EFLAGS))
        return false;
    }
  }

  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
  // out. SelectMI should have a kill flag on EFLAGS.
  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
  return true;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  // To "insert" a SELECT_CC instruction, we actually have to insert the
  // diamond control-flow pattern.  The incoming instruction knows the
  // The incoming instruction knows the
  // destination vreg to set, the condition code register to branch on, the
  // true/false values to select between, and a branch opcode to use.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = BB;
  ++It;

  //  thisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC copy1MBB
  //   fallthrough --> copy0MBB
  MachineBasicBlock *thisMBB = BB;
  MachineFunction *F = BB->getParent();
  // copy0MBB supplies the false value; sinkMBB holds the merging PHI.
  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(It, copy0MBB);
  F->insert(It, sinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
  if (!MI->killsRegister(X86::EFLAGS) &&
      !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
    copy0MBB->addLiveIn(X86::EFLAGS);
    sinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), BB,
                  llvm::next(MachineBasicBlock::iterator(MI)),
                  BB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Add the true and fallthrough blocks as its successors.
  BB->addSuccessor(copy0MBB);
  BB->addSuccessor(sinkMBB);

  // Create the conditional branch instruction. Operand 3 of the CMOV pseudo
  // carries the X86 condition code; branching to sinkMBB keeps the true
  // value, falling through to copy0MBB selects the false value.
  unsigned Opc =
    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);

  //  copy0MBB:
  //   %FalseValue = ...
  //   # fallthrough to sinkMBB
  copy0MBB->addSuccessor(sinkMBB);

  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
  // Merge: operand 1 is the true value (flows in from thisMBB),
  // operand 2 the false value (flows in from copy0MBB).
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(X86::PHI), MI->getOperand(0).getReg())
    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);

  MI->eraseFromParent();   // The pseudo instruction is gone now.
  return sinkMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
                                        bool Is64Bit) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();
  MachineFunction *MF = BB->getParent();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();

  assert(getTargetMachine().Options.EnableSegmentedStacks);

  // Segmented stacks keep the current stacklet's limit in thread-local
  // storage; FS:0x70 on x86-64, GS:0x30 on x86-32.
  // NOTE(review): offsets presumably follow the segmented-stack ABI used by
  // the runtime - confirm against the __morestack implementation.
  unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
  unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;

  // BB:
  //  ... [Till the alloca]
  // If stacklet is not large enough, jump to mallocMBB
  //
  // bumpMBB:
  //  Allocate by subtracting from RSP
  //  Jump to continueMBB
  //
  // mallocMBB:
  //  Allocate by call to runtime
  //
  // continueMBB:
  //  ...
  //  [rest of original BB]
  //

  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);

  MachineRegisterInfo &MRI = MF->getRegInfo();
  const TargetRegisterClass *AddrRegClass =
    getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);

  // mallocPtrVReg/bumpSPPtrVReg are the two possible results that the PHI in
  // continueMBB merges; sizeVReg is the requested allocation size (operand 1).
  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
    bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
    tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
    SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
    sizeVReg = MI->getOperand(1).getReg(),
    physSPReg = Is64Bit ?
      X86::RSP : X86::ESP;

  MachineFunction::iterator MBBIter = BB;
  ++MBBIter;

  // Insert the new blocks immediately after BB, in fall-through order.
  MF->insert(MBBIter, bumpMBB);
  MF->insert(MBBIter, mallocMBB);
  MF->insert(MBBIter, continueMBB);

  // Everything after the pseudo moves to continueMBB, which inherits BB's
  // successors (PHIs included).
  continueMBB->splice(continueMBB->begin(), BB, llvm::next
                      (MachineBasicBlock::iterator(MI)), BB->end());
  continueMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Add code to the main basic block to check if the stack limit has been hit,
  // and if so, jump to mallocMBB otherwise to bumpMBB.
  // SPLimitVReg = SP - size; compare the TLS stack limit against it.
  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
  BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
    .addReg(tmpSPVReg).addReg(sizeVReg);
  BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
    .addReg(SPLimitVReg);
  // Limit > SP-size means the stacklet is too small: take the runtime path.
  BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);

  // bumpMBB simply decreases the stack pointer, since we know the current
  // stacklet has enough space.
  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
    .addReg(SPLimitVReg);
  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
    .addReg(SPLimitVReg);
  BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);

  // Calls into a routine in libgcc to allocate more space from the heap.
  // The runtime call clobbers per the C calling convention.
  const uint32_t *RegMask =
    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
  if (Is64Bit) {
    // x86-64: size goes in RDI, result comes back in RAX.
    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
      .addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
      .addExternalSymbol("__morestack_allocate_stack_space")
      .addRegMask(RegMask)
      .addReg(X86::RDI, RegState::Implicit)
      .addReg(X86::RAX, RegState::ImplicitDefine);
  } else {
    // x86-32: the argument is pushed on the stack; SUB 12 + PUSH 4 keeps the
    // outgoing stack 16-byte aligned. Result comes back in EAX.
    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
      .addImm(12);
    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
      .addExternalSymbol("__morestack_allocate_stack_space")
      .addRegMask(RegMask)
      .addReg(X86::EAX, RegState::ImplicitDefine);
  }

  // Pop the 16 bytes (12 padding + 4 argument) pushed above.
  if (!Is64Bit)
    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
      .addImm(16);

  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
    .addReg(Is64Bit ? X86::RAX : X86::EAX);
  BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);

  // Set up the CFG correctly.
  BB->addSuccessor(bumpMBB);
  BB->addSuccessor(mallocMBB);
  mallocMBB->addSuccessor(continueMBB);
  bumpMBB->addSuccessor(continueMBB);

  // Take care of the PHI nodes.
  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
          MI->getOperand(0).getReg())
    .addReg(mallocPtrVReg).addMBB(mallocMBB)
    .addReg(bumpSPPtrVReg).addMBB(bumpMBB);

  // Delete the original pseudo instruction.
  MI->eraseFromParent();

  // And we're done.
  return continueMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
                                        MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  assert(!Subtarget->isTargetEnvMacho());

  // The lowering is pretty easy: we're just emitting the call to _alloca. The
  // non-trivial part is impdef of ESP.

  if (Subtarget->isTargetWin64()) {
    if (Subtarget->isTargetCygMing()) {
      // ___chkstk(Mingw64):
      // Clobbers R10, R11, RAX and EFLAGS.
      // Updates RSP.
      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
        .addExternalSymbol("___chkstk")
        .addReg(X86::RAX, RegState::Implicit)
        .addReg(X86::RSP, RegState::Implicit)
        .addReg(X86::RAX, RegState::Define | RegState::Implicit)
        .addReg(X86::RSP, RegState::Define | RegState::Implicit)
        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
    } else {
      // __chkstk(MSVCRT): does not update stack pointer.
      // Clobbers R10, R11 and EFLAGS.
      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
        .addExternalSymbol("__chkstk")
        .addReg(X86::RAX, RegState::Implicit)
        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
      // RAX has the offset to be subtracted from RSP.
      BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
        .addReg(X86::RSP)
        .addReg(X86::RAX);
    }
  } else {
    // 32-bit: the probe routine differs by environment.
    const char *StackProbeSymbol =
      Subtarget->isTargetWindows() ?
"_chkstk" : "_alloca"; 15501 15502 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 15503 .addExternalSymbol(StackProbeSymbol) 15504 .addReg(X86::EAX, RegState::Implicit) 15505 .addReg(X86::ESP, RegState::Implicit) 15506 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 15507 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 15508 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 15509 } 15510 15511 MI->eraseFromParent(); // The pseudo instruction is gone now. 15512 return BB; 15513} 15514 15515MachineBasicBlock * 15516X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 15517 MachineBasicBlock *BB) const { 15518 // This is pretty easy. We're taking the value that we received from 15519 // our load from the relocation, sticking it in either RDI (x86-64) 15520 // or EAX and doing an indirect call. The return value will then 15521 // be in the normal return register. 15522 const X86InstrInfo *TII 15523 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 15524 DebugLoc DL = MI->getDebugLoc(); 15525 MachineFunction *F = BB->getParent(); 15526 15527 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 15528 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 15529 15530 // Get a register mask for the lowered call. 15531 // FIXME: The 32-bit calls have non-standard calling conventions. Use a 15532 // proper register mask. 
  const uint32_t *RegMask =
    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
  if (Subtarget->is64Bit()) {
    // x86-64: RIP-relative load of the TLS descriptor into RDI, then an
    // indirect call through [RDI]; the result lands in RAX.
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                      TII->get(X86::MOV64rm), X86::RDI)
    .addReg(X86::RIP)
    .addImm(0).addReg(0)
    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
                      MI->getOperand(3).getTargetFlags())
    .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
    addDirectMem(MIB, X86::RDI);
    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
  } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
    // 32-bit non-PIC: absolute load into EAX, indirect call through [EAX].
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                      TII->get(X86::MOV32rm), X86::EAX)
    .addReg(0)
    .addImm(0).addReg(0)
    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
                      MI->getOperand(3).getTargetFlags())
    .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  } else {
    // 32-bit PIC: address the descriptor relative to the GOT base register.
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                      TII->get(X86::MOV32rm), X86::EAX)
    .addReg(TII->getGlobalBaseReg(F))
    .addImm(0).addReg(0)
    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
                      MI->getOperand(3).getTargetFlags())
    .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  }

  MI->eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}

MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
                                    MachineBasicBlock *MBB) const {
  DebugLoc DL = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = MBB;
  ++I;

  // Memory Reference
  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();

  unsigned DstReg;
  unsigned MemOpndSlot = 0;

  unsigned CurOp = 0;

  // Operand 0 is the i32 result; the remaining operands form the address of
  // the jmp_buf (starting at MemOpndSlot).
  DstReg = MI->getOperand(CurOp++).getReg();
  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  assert(RC->hasType(MVT::i32) && "Invalid destination!");
  unsigned mainDstReg = MRI.createVirtualRegister(RC);
  unsigned restoreDstReg = MRI.createVirtualRegister(RC);

  MemOpndSlot = CurOp;

  MVT PVT = getPointerTy();
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");

  // For v = setjmp(buf), we generate
  //
  // thisMBB:
  //  buf[LabelOffset] = restoreMBB
  //  SjLjSetup restoreMBB
  //
  // mainMBB:
  //  v_main = 0
  //
  // sinkMBB:
  //  v = phi(main, restore)
  //
  // restoreMBB:
  //  v_restore = 1

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);
  // restoreMBB is only reached via longjmp, so it goes at the function end.
  MF->push_back(restoreMBB);

  MachineInstrBuilder MIB;

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // thisMBB:
  unsigned PtrStoreOpc = 0;
  unsigned LabelReg = 0;
  // Slot 1 of the jmp_buf holds the resume address (slot size = pointer size).
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  Reloc::Model RM = getTargetMachine().getRelocationModel();
  // Only small code model with static relocation can encode the label as an
  // immediate; otherwise it must be materialized into a register first.
  bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) &&
                     (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);

  // Prepare IP either in reg or imm.
  if (!UseImmLabel) {
    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
    LabelReg = MRI.createVirtualRegister(PtrRC);
    if (Subtarget->is64Bit()) {
      // LEA of restoreMBB's address, RIP-relative.
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
              .addReg(X86::RIP)
              .addImm(0)
              .addReg(0)
              .addMBB(restoreMBB)
              .addReg(0);
    } else {
      // 32-bit: address restoreMBB relative to the global base register.
      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
              .addReg(XII->getGlobalBaseReg(MF))
              .addImm(0)
              .addReg(0)
              .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
              .addReg(0);
    }
  } else
    PtrStoreOpc = (PVT == MVT::i64) ?
                  X86::MOV64mi32 : X86::MOV32mi;
  // Store IP: write the resume address into buf[LabelOffset], reusing the
  // pseudo's memory operands for the address and shifting the displacement.
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
    else
      MIB.addOperand(MI->getOperand(MemOpndSlot + i));
  }
  if (!UseImmLabel)
    MIB.addReg(LabelReg);
  else
    MIB.addMBB(restoreMBB);
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Setup
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
          .addMBB(restoreMBB);

  // The setup pseudo preserves nothing: all registers may be clobbered on
  // the longjmp path.
  const X86RegisterInfo *RegInfo =
    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
  MIB.addRegMask(RegInfo->getNoPreservedMask());
  thisMBB->addSuccessor(mainMBB);
  thisMBB->addSuccessor(restoreMBB);

  // mainMBB:
  //  EAX = 0
  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  // v = phi(0 from the direct path, 1 from the longjmp path).
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(X86::PHI), DstReg)
    .addReg(mainDstReg).addMBB(mainMBB)
    .addReg(restoreDstReg).addMBB(restoreMBB);

  // restoreMBB:
  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
  BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
  restoreMBB->addSuccessor(sinkMBB);

  MI->eraseFromParent();
  return sinkMBB;
}

MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
                                     MachineBasicBlock *MBB) const {
  DebugLoc DL = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // Memory Reference
  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();

  MVT PVT =
        getPointerTy();
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");

  const TargetRegisterClass *RC =
    (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
  unsigned Tmp = MRI.createVirtualRegister(RC);
  // Since FP is only updated here but NOT referenced, it's treated as GPR.
  const X86RegisterInfo *RegInfo =
    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
  unsigned SP = RegInfo->getStackRegister();

  MachineInstrBuilder MIB;

  // jmp_buf layout: slot 0 = FP, slot 1 = resume IP, slot 2 = SP.
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t SPOffset = 2 * PVT.getStoreSize();

  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;

  // Reload FP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
    MIB.addOperand(MI->getOperand(i));
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Reload IP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI->getOperand(i), LabelOffset);
    else
      MIB.addOperand(MI->getOperand(i));
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Reload SP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI->getOperand(i), SPOffset);
    else
      MIB.addOperand(MI->getOperand(i));
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Jump indirectly through the reloaded IP.
  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);

  MI->eraseFromParent();
  return MBB;
}

MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                               MachineBasicBlock *BB) const {
  // Dispatch each custom-inserted pseudo to its dedicated emitter.
  switch (MI->getOpcode()) {
  default: llvm_unreachable("Unexpected instr type to insert");
  case X86::TAILJMPd64:
  case X86::TAILJMPr64:
  case X86::TAILJMPm64:
    llvm_unreachable("TAILJMP64 would not be touched here.");
  case X86::TCRETURNdi64:
  case X86::TCRETURNri64:
  case X86::TCRETURNmi64:
    return BB;
  case X86::WIN_ALLOCA:
    return EmitLoweredWinAlloca(MI, BB);
  case X86::SEG_ALLOCA_32:
    return EmitLoweredSegAlloca(MI, BB, false);
  case X86::SEG_ALLOCA_64:
    return EmitLoweredSegAlloca(MI, BB, true);
  case X86::TLSCall_32:
  case X86::TLSCall_64:
    return EmitLoweredTLSCall(MI, BB);
  case X86::CMOV_GR8:
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_V4F32:
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64:
  case X86::CMOV_V8F32:
  case X86::CMOV_V4F64:
  case X86::CMOV_V4I64:
  case X86::CMOV_V16F32:
  case X86::CMOV_V8F64:
  case X86::CMOV_V8I64:
  case X86::CMOV_GR16:
  case X86::CMOV_GR32:
  case X86::CMOV_RFP32:
  case X86::CMOV_RFP64:
  case X86::CMOV_RFP80:
    return EmitLoweredSelect(MI, BB);

  case X86::FP32_TO_INT16_IN_MEM:
  case X86::FP32_TO_INT32_IN_MEM:
  case X86::FP32_TO_INT64_IN_MEM:
  case X86::FP64_TO_INT16_IN_MEM:
  case X86::FP64_TO_INT32_IN_MEM:
  case X86::FP64_TO_INT64_IN_MEM:
  case X86::FP80_TO_INT16_IN_MEM:
  case X86::FP80_TO_INT32_IN_MEM:
  case X86::FP80_TO_INT64_IN_MEM: {
    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();

    // Change the floating point control register to use "round towards zero"
    // mode when truncating to an integer value.
    MachineFunction *F = BB->getParent();
    // Two-byte aligned stack slot to hold the x87 control word.
    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FNSTCW16m)), CWFrameIdx);

    // Load the old value of the high byte of the control word...
    unsigned OldCW =
      F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
                      CWFrameIdx);

    // Set the high part to be round to zero...
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
      .addImm(0xC7F);

    // Reload the modified control word now...
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FLDCW16m)), CWFrameIdx);

    // Restore the memory image of control word to original value
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
      .addReg(OldCW);

    // Get the X86 opcode to use.
    unsigned Opc;
    switch (MI->getOpcode()) {
    default: llvm_unreachable("illegal opcode!");
    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
    }

    // Rebuild the destination address from the pseudo's memory operands
    // (base, scale, index, disp/global).
    X86AddressMode AM;
    MachineOperand &Op = MI->getOperand(0);
    if (Op.isReg()) {
      AM.BaseType = X86AddressMode::RegBase;
      AM.Base.Reg = Op.getReg();
    } else {
      AM.BaseType = X86AddressMode::FrameIndexBase;
      AM.Base.FrameIndex =
        Op.getIndex();
    }
    Op = MI->getOperand(1);
    if (Op.isImm())
      AM.Scale = Op.getImm();
    Op = MI->getOperand(2);
    if (Op.isImm())
      AM.IndexReg = Op.getImm();
    Op = MI->getOperand(3);
    if (Op.isGlobal()) {
      AM.GV = Op.getGlobal();
    } else {
      AM.Disp = Op.getImm();
    }
    // Emit the truncating store itself; the value operand follows the
    // address operands in the pseudo.
    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
      .addReg(MI->getOperand(X86::AddrNumOperands).getReg());

    // Reload the original control word now.
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FLDCW16m)), CWFrameIdx);

    MI->eraseFromParent();   // The pseudo instruction is gone now.
    return BB;
  }
    // String/text processing lowering.
  case X86::PCMPISTRM128REG:
  case X86::VPCMPISTRM128REG:
  case X86::PCMPISTRM128MEM:
  case X86::VPCMPISTRM128MEM:
  case X86::PCMPESTRM128REG:
  case X86::VPCMPESTRM128REG:
  case X86::PCMPESTRM128MEM:
  case X86::VPCMPESTRM128MEM:
    assert(Subtarget->hasSSE42() &&
           "Target must have SSE4.2 or AVX features enabled");
    return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo());

    // String/text processing lowering.
  case X86::PCMPISTRIREG:
  case X86::VPCMPISTRIREG:
  case X86::PCMPISTRIMEM:
  case X86::VPCMPISTRIMEM:
  case X86::PCMPESTRIREG:
  case X86::VPCMPESTRIREG:
  case X86::PCMPESTRIMEM:
  case X86::VPCMPESTRIMEM:
    assert(Subtarget->hasSSE42() &&
           "Target must have SSE4.2 or AVX features enabled");
    return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo());

    // Thread synchronization.
  case X86::MONITOR:
    return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget);

    // xbegin
  case X86::XBEGIN:
    return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo());

    // Atomic Lowering.
  // All pseudo atomic read-modify-write ops (AND/OR/XOR/NAND and the
  // signed/unsigned min/max family) share one emitter.
  case X86::ATOMAND8:
  case X86::ATOMAND16:
  case X86::ATOMAND32:
  case X86::ATOMAND64:
    // Fall through
  case X86::ATOMOR8:
  case X86::ATOMOR16:
  case X86::ATOMOR32:
  case X86::ATOMOR64:
    // Fall through
  case X86::ATOMXOR16:
  case X86::ATOMXOR8:
  case X86::ATOMXOR32:
  case X86::ATOMXOR64:
    // Fall through
  case X86::ATOMNAND8:
  case X86::ATOMNAND16:
  case X86::ATOMNAND32:
  case X86::ATOMNAND64:
    // Fall through
  case X86::ATOMMAX8:
  case X86::ATOMMAX16:
  case X86::ATOMMAX32:
  case X86::ATOMMAX64:
    // Fall through
  case X86::ATOMMIN8:
  case X86::ATOMMIN16:
  case X86::ATOMMIN32:
  case X86::ATOMMIN64:
    // Fall through
  case X86::ATOMUMAX8:
  case X86::ATOMUMAX16:
  case X86::ATOMUMAX32:
  case X86::ATOMUMAX64:
    // Fall through
  case X86::ATOMUMIN8:
  case X86::ATOMUMIN16:
  case X86::ATOMUMIN32:
  case X86::ATOMUMIN64:
    return EmitAtomicLoadArith(MI, BB);

    // This group does 64-bit operations on a 32-bit host.
  case X86::ATOMAND6432:
  case X86::ATOMOR6432:
  case X86::ATOMXOR6432:
  case X86::ATOMNAND6432:
  case X86::ATOMADD6432:
  case X86::ATOMSUB6432:
  case X86::ATOMMAX6432:
  case X86::ATOMMIN6432:
  case X86::ATOMUMAX6432:
  case X86::ATOMUMIN6432:
  case X86::ATOMSWAP6432:
    return EmitAtomicLoadArith6432(MI, BB);

  case X86::VASTART_SAVE_XMM_REGS:
    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);

  case X86::VAARG_64:
    return EmitVAARG64WithCustomInserter(MI, BB);

  case X86::EH_SjLj_SetJmp32:
  case X86::EH_SjLj_SetJmp64:
    return emitEHSjLjSetJmp(MI, BB);

  case X86::EH_SjLj_LongJmp32:
  case X86::EH_SjLj_LongJmp64:
    return emitEHSjLjLongJmp(MI, BB);
  }
}

//===----------------------------------------------------------------------===//
//                           X86 Optimization Hooks
//===----------------------------------------------------------------------===//

// Reports known-zero/known-one bits for X86-specific nodes so the generic
// combiner can simplify around them.
void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
                                                       APInt &KnownZero,
                                                       APInt &KnownOne,
                                                       const SelectionDAG &DAG,
                                                       unsigned Depth) const {
  unsigned BitWidth = KnownZero.getBitWidth();
  unsigned Opc = Op.getOpcode();
  assert((Opc >= ISD::BUILTIN_OP_END ||
          Opc == ISD::INTRINSIC_WO_CHAIN ||
          Opc == ISD::INTRINSIC_W_CHAIN ||
          Opc == ISD::INTRINSIC_VOID) &&
         "Should use MaskedValueIsZero if you don't know whether Op"
         " is a target node!");

  KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
  switch (Opc) {
  default: break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::ADC:
  case X86ISD::SBB:
  case X86ISD::SMUL:
  case X86ISD::UMUL:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    // These nodes' second result is a boolean.
    if (Op.getResNo() == 0)
      break;
    // Fallthrough
  case X86ISD::SETCC:
    // A boolean result: every bit above bit 0 is known zero.
    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
    break;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    unsigned NumLoBits = 0;
    switch (IntId) {
    default: break;
    case Intrinsic::x86_sse_movmsk_ps:
    case Intrinsic::x86_avx_movmsk_ps_256:
    case Intrinsic::x86_sse2_movmsk_pd:
    case Intrinsic::x86_avx_movmsk_pd_256:
    case Intrinsic::x86_mmx_pmovmskb:
    case Intrinsic::x86_sse2_pmovmskb_128:
    case Intrinsic::x86_avx2_pmovmskb: {
      // High bits of movmskp{s|d}, pmovmskb are known zero.
      // NumLoBits = number of mask bits the instruction produces, i.e. the
      // source vector's element count.
      switch (IntId) {
        default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
        case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
        case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
        case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
        case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
        case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
        case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
        case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
      }
      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
      break;
    }
    }
    break;
  }
  }
}

unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
                                                         unsigned Depth) const {
  // SETCC_CARRY sets the dest to ~0 for true or 0 for false, so every bit
  // is a copy of the sign bit.
  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
    return Op.getValueType().getScalarType().getSizeInBits();

  // Fallback case.
  return 1;
}

/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
/// node is a GlobalAddress + offset.
16089bool X86TargetLowering::isGAPlusOffset(SDNode *N, 16090 const GlobalValue* &GA, 16091 int64_t &Offset) const { 16092 if (N->getOpcode() == X86ISD::Wrapper) { 16093 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 16094 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 16095 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 16096 return true; 16097 } 16098 } 16099 return TargetLowering::isGAPlusOffset(N, GA, Offset); 16100} 16101 16102/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the 16103/// same as extracting the high 128-bit part of 256-bit vector and then 16104/// inserting the result into the low part of a new 256-bit vector 16105static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { 16106 EVT VT = SVOp->getValueType(0); 16107 unsigned NumElems = VT.getVectorNumElements(); 16108 16109 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 16110 for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j) 16111 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 16112 SVOp->getMaskElt(j) >= 0) 16113 return false; 16114 16115 return true; 16116} 16117 16118/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the 16119/// same as extracting the low 128-bit part of 256-bit vector and then 16120/// inserting the result into the high part of a new 256-bit vector 16121static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { 16122 EVT VT = SVOp->getValueType(0); 16123 unsigned NumElems = VT.getVectorNumElements(); 16124 16125 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 16126 for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j) 16127 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 16128 SVOp->getMaskElt(j) >= 0) 16129 return false; 16130 16131 return true; 16132} 16133 16134/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. 
static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const X86Subtarget* Subtarget) {
  SDLoc dl(N);
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  EVT VT = SVOp->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();

  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
      V2.getOpcode() == ISD::CONCAT_VECTORS) {
    //
    //                   0,0,0,...
    //                      |
    //    V      UNDEF    BUILD_VECTOR    UNDEF
    //     \      /           \           /
    //      CONCAT_VECTOR         CONCAT_VECTOR
    //             \                  /
    //              \                /
    //          RESULT: V + zero extended
    //
    // Require the pattern above: V1 = concat(V, undef), V2 = concat(0, undef).
    if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
        V2.getOperand(1).getOpcode() != ISD::UNDEF ||
        V1.getOperand(1).getOpcode() != ISD::UNDEF)
      return SDValue();

    if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
      return SDValue();

    // To match the shuffle mask, the first half of the mask should
    // be exactly the first vector, and all the rest a splat with the
    // first element of the second one.
    for (unsigned i = 0; i != NumElems/2; ++i)
      if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
        return SDValue();

    // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
16174 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) { 16175 if (Ld->hasNUsesOfValue(1, 0)) { 16176 SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); 16177 SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; 16178 SDValue ResNode = 16179 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 16180 array_lengthof(Ops), 16181 Ld->getMemoryVT(), 16182 Ld->getPointerInfo(), 16183 Ld->getAlignment(), 16184 false/*isVolatile*/, true/*ReadMem*/, 16185 false/*WriteMem*/); 16186 16187 // Make sure the newly-created LOAD is in the same position as Ld in 16188 // terms of dependency. We create a TokenFactor for Ld and ResNode, 16189 // and update uses of Ld's output chain to use the TokenFactor. 16190 if (Ld->hasAnyUseOfValue(1)) { 16191 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 16192 SDValue(Ld, 1), SDValue(ResNode.getNode(), 1)); 16193 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); 16194 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1), 16195 SDValue(ResNode.getNode(), 1)); 16196 } 16197 16198 return DAG.getNode(ISD::BITCAST, dl, VT, ResNode); 16199 } 16200 } 16201 16202 // Emit a zeroed vector and insert the desired subvector on its 16203 // first half. 
16204 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 16205 SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl); 16206 return DCI.CombineTo(N, InsV); 16207 } 16208 16209 //===--------------------------------------------------------------------===// 16210 // Combine some shuffles into subvector extracts and inserts: 16211 // 16212 16213 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 16214 if (isShuffleHigh128VectorInsertLow(SVOp)) { 16215 SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl); 16216 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl); 16217 return DCI.CombineTo(N, InsV); 16218 } 16219 16220 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 16221 if (isShuffleLow128VectorInsertHigh(SVOp)) { 16222 SDValue V = Extract128BitVector(V1, 0, DAG, dl); 16223 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl); 16224 return DCI.CombineTo(N, InsV); 16225 } 16226 16227 return SDValue(); 16228} 16229 16230/// PerformShuffleCombine - Performs several different shuffle combines. 16231static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 16232 TargetLowering::DAGCombinerInfo &DCI, 16233 const X86Subtarget *Subtarget) { 16234 SDLoc dl(N); 16235 EVT VT = N->getValueType(0); 16236 16237 // Don't create instructions with illegal types after legalize types has run. 16238 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16239 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 16240 return SDValue(); 16241 16242 // Combine 256-bit vector shuffles. This is only profitable when in AVX mode 16243 if (Subtarget->hasFp256() && VT.is256BitVector() && 16244 N->getOpcode() == ISD::VECTOR_SHUFFLE) 16245 return PerformShuffleCombine256(N, DAG, DCI, Subtarget); 16246 16247 // Only handle 128 wide vector from here on. 
  if (!VT.is128BitVector())
    return SDValue();

  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
  // consecutive, non-overlapping, and in the right order.
  SmallVector<SDValue, 16> Elts;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));

  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
}

/// PerformTruncateCombine - Converts truncate operation to
/// a sequence of vector shuffle operations.
/// It is possible when we truncate 256-bit vector to 128-bit vector
static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget *Subtarget) {
  // Intentionally a no-op at the moment; kept as a hook for the combine
  // dispatcher.
  return SDValue();
}

/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
/// specific shuffle of a load can be folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
/// shuffles have been custom lowered so we need to handle those here.
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                                                TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue InVec = N->getOperand(0);
  SDValue EltNo = N->getOperand(1);

  // Only constant extract indices can be folded into a scalar load.
  if (!isa<ConstantSDNode>(EltNo))
    return SDValue();

  EVT VT = InVec.getValueType();

  // Peel off a bitcast between the extract and the shuffle, but only if it
  // keeps the element count (so the mask still lines up).
  bool HasShuffleIntoBitcast = false;
  if (InVec.getOpcode() == ISD::BITCAST) {
    // Don't duplicate a load with other uses.
    if (!InVec.hasOneUse())
      return SDValue();
    EVT BCVT = InVec.getOperand(0).getValueType();
    if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
      return SDValue();
    InVec = InVec.getOperand(0);
    HasShuffleIntoBitcast = true;
  }

  if (!isTargetShuffle(InVec.getOpcode()))
    return SDValue();

  // Don't duplicate a load with other uses.
  if (!InVec.hasOneUse())
    return SDValue();

  SmallVector<int, 16> ShuffleMask;
  bool UnaryShuffle;
  if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
                            UnaryShuffle))
    return SDValue();

  // Select the input vector, guarding against out of range extract vector.
  unsigned NumElems = VT.getVectorNumElements();
  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
  // NOTE(review): 'Elt > NumElems' still permits Elt == NumElems to index
  // ShuffleMask below — this looks like it should be '>='; confirm the mask
  // size returned by getTargetShuffleMask before relying on it.
  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
  // Mask values < NumElems come from operand 0, the rest from operand 1.
  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
                                         : InVec.getOperand(1);

  // If inputs to shuffle are the same for both ops, then allow 2 uses
  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;

  if (LdNode.getOpcode() == ISD::BITCAST) {
    // Don't duplicate a load with other uses.
    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
      return SDValue();

    AllowedUses = 1; // only allow 1 load use if we have a bitcast
    LdNode = LdNode.getOperand(0);
  }

  if (!ISD::isNormalLoad(LdNode.getNode()))
    return SDValue();

  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);

  // Volatile loads must not be duplicated or narrowed.
  if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
    return SDValue();
16342 unsigned Align = LN0->getAlignment(); 16343 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16344 unsigned NewAlign = TLI.getDataLayout()-> 16345 getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext())); 16346 16347 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) 16348 return SDValue(); 16349 } 16350 16351 // All checks match so transform back to vector_shuffle so that DAG combiner 16352 // can finish the job 16353 SDLoc dl(N); 16354 16355 // Create shuffle node taking into account the case that its a unary shuffle 16356 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1); 16357 Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl, 16358 InVec.getOperand(0), Shuffle, 16359 &ShuffleMask[0]); 16360 Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 16361 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, 16362 EltNo); 16363} 16364 16365/// Extract one bit from mask vector, like v16i1 or v8i1. 16366/// AVX-512 feature. 16367static SDValue ExtractBitFromMaskVector(SDNode *N, SelectionDAG &DAG) { 16368 SDValue Vec = N->getOperand(0); 16369 SDLoc dl(Vec); 16370 MVT VecVT = Vec.getSimpleValueType(); 16371 SDValue Idx = N->getOperand(1); 16372 MVT EltVT = N->getSimpleValueType(0); 16373 16374 assert((VecVT.getVectorElementType() == MVT::i1 && EltVT == MVT::i8) || 16375 "Unexpected operands in ExtractBitFromMaskVector"); 16376 16377 // variable index 16378 if (!isa<ConstantSDNode>(Idx)) { 16379 MVT ExtVT = (VecVT == MVT::v8i1 ? 
MVT::v8i64 : MVT::v16i32); 16380 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec); 16381 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 16382 ExtVT.getVectorElementType(), Ext); 16383 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); 16384 } 16385 16386 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 16387 16388 MVT ScalarVT = MVT::getIntegerVT(VecVT.getSizeInBits()); 16389 unsigned MaxShift = VecVT.getSizeInBits() - 1; 16390 Vec = DAG.getNode(ISD::BITCAST, dl, ScalarVT, Vec); 16391 Vec = DAG.getNode(ISD::SHL, dl, ScalarVT, Vec, 16392 DAG.getConstant(MaxShift - IdxVal, ScalarVT)); 16393 Vec = DAG.getNode(ISD::SRL, dl, ScalarVT, Vec, 16394 DAG.getConstant(MaxShift, ScalarVT)); 16395 16396 if (VecVT == MVT::v16i1) { 16397 Vec = DAG.getNode(ISD::BITCAST, dl, MVT::i16, Vec); 16398 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Vec); 16399 } 16400 return DAG.getNode(ISD::BITCAST, dl, MVT::i8, Vec); 16401} 16402 16403/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 16404/// generation and convert it from being a bunch of shuffles and extracts 16405/// to a simple store and scalar loads to extract the elements. 16406static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 16407 TargetLowering::DAGCombinerInfo &DCI) { 16408 SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI); 16409 if (NewOp.getNode()) 16410 return NewOp; 16411 16412 SDValue InputVector = N->getOperand(0); 16413 16414 if (InputVector.getValueType().getVectorElementType() == MVT::i1 && 16415 !DCI.isBeforeLegalize()) 16416 return ExtractBitFromMaskVector(N, DAG); 16417 16418 // Detect whether we are trying to convert from mmx to i32 and the bitcast 16419 // from mmx to v2i32 has a single usage. 
  if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
      InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
      InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
    return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
                       N->getValueType(0),
                       InputVector.getNode()->getOperand(0));

  // Only operate on vectors of 4 elements, where the alternative shuffling
  // gets to be more expensive.
  if (InputVector.getValueType() != MVT::v4i32)
    return SDValue();

  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
  // single use which is a sign-extend or zero-extend, and all elements are
  // used.
  SmallVector<SDNode *, 4> Uses;
  unsigned ExtractedElements = 0;   // bitmask of extracted lane indices
  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
    // Bail if a use reads a different result of the producing node.
    if (UI.getUse().getResNo() != InputVector.getResNo())
      return SDValue();

    SDNode *Extract = *UI;
    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    if (Extract->getValueType(0) != MVT::i32)
      return SDValue();
    if (!Extract->hasOneUse())
      return SDValue();
    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
      return SDValue();
    // Only constant lane indices can be rewritten as fixed stack offsets.
    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
      return SDValue();

    // Record which element was extracted.
    ExtractedElements |=
      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();

    Uses.push_back(Extract);
  }

  // If not all the elements were used, this may not be worthwhile.
  if (ExtractedElements != 15)
    return SDValue();

  // Ok, we've now decided to do the transformation.
  SDLoc dl(InputVector);

  // Store the value to a temporary stack slot.
  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
                            MachinePointerInfo(), false, false, 0);

  // Replace each use (extract) with a load of the appropriate element.
  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
       UE = Uses.end(); UI != UE; ++UI) {
    SDNode *Extract = *UI;

    // Compute the element's address.
    SDValue Idx = Extract->getOperand(1);
    unsigned EltSize =
        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());

    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
                                     StackPtr, OffsetVal);

    // Load the scalar.
    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
                                     ScalarAddr, MachinePointerInfo(),
                                     false, false, false, 0);

    // Replace the extract with the load.
    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
  }

  // The replacement was made in place; don't return anything.
  return SDValue();
}

/// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
static std::pair<unsigned, bool>
matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
                   SelectionDAG &DAG, const X86Subtarget *Subtarget) {
  // Returns {opcode, needs-split}: opcode is 0 on no match; needs-split is
  // true when a 256-bit type must be done as two 128-bit halves (no AVX2).
  if (!VT.isVector())
    return std::make_pair(0, false);

  bool NeedSplit = false;
  switch (VT.getSimpleVT().SimpleTy) {
  default: return std::make_pair(0, false);
  case MVT::v32i8:
  case MVT::v16i16:
  case MVT::v8i32:
    if (!Subtarget->hasAVX2())
      NeedSplit = true;
    if (!Subtarget->hasAVX())
      return std::make_pair(0, false);
    break;
  case MVT::v16i8:
  case MVT::v8i16:
  case MVT::v4i32:
    if (!Subtarget->hasSSE2())
      return std::make_pair(0, false);
  }

  // SSE2 has only a small subset of the operations.
  bool hasUnsigned = Subtarget->hasSSE41() ||
                     (Subtarget->hasSSE2() && VT == MVT::v16i8);
  bool hasSigned = Subtarget->hasSSE41() ||
                   (Subtarget->hasSSE2() && VT == MVT::v8i16);

  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

  unsigned Opc = 0;
  // Check for x CC y ? x : y.
  if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
    switch (CC) {
    default: break;
    case ISD::SETULT:
    case ISD::SETULE:
      Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
    case ISD::SETUGT:
    case ISD::SETUGE:
      Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
    case ISD::SETLT:
    case ISD::SETLE:
      Opc = hasSigned ? X86ISD::SMIN : 0; break;
    case ISD::SETGT:
    case ISD::SETGE:
      Opc = hasSigned ? X86ISD::SMAX : 0; break;
    }
  // Check for x CC y ? y : x -- a min/max with reversed arms.
  } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
             DAG.isEqualTo(RHS, Cond.getOperand(0))) {
    // Same table as above, but with min and max exchanged because the
    // select arms are swapped relative to the comparison.
    switch (CC) {
    default: break;
    case ISD::SETULT:
    case ISD::SETULE:
      Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
    case ISD::SETUGT:
    case ISD::SETUGE:
      Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
    case ISD::SETLT:
    case ISD::SETLE:
      Opc = hasSigned ? X86ISD::SMAX : 0; break;
    case ISD::SETGT:
    case ISD::SETGE:
      Opc = hasSigned ? X86ISD::SMIN : 0; break;
    }
  }

  return std::make_pair(Opc, NeedSplit);
}

/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
/// nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget *Subtarget) {
  SDLoc DL(N);
  SDValue Cond = N->getOperand(0);
  // Get the LHS/RHS of the select.
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  EVT VT = LHS.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
  // instructions match the semantics of the common C idiom x<y?x:y but not
  // x<=y?x:y, because of how they handle negative zero (which can be
  // ignored in unsafe-math mode).
  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
      VT != MVT::f80 && TLI.isTypeLegal(VT) &&
      (Subtarget->hasSSE2() ||
       (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    unsigned Opcode = 0;
    // Check for x CC y ? x : y.
    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
      switch (CC) {
      default: break;
      case ISD::SETULT:
        // Converting this to a min would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!DAG.getTarget().Options.UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETOLE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETULE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // Intentional fallthrough to the plain FMIN cases below.
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETOGE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGT:
        // Converting this to a max would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!DAG.getTarget().Options.UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // Intentional fallthrough to the plain FMAX cases below.
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMAX;
        break;
      }
    // Check for x CC y ? y : x -- a min/max with reversed arms.
    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
      switch (CC) {
      default: break;
      case ISD::SETOGE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGT:
        // Converting this to a min would handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // Intentional fallthrough to the plain FMIN cases below.
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETULT:
        // Converting this to a max would handle NaNs incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETOLE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETULE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // Intentional fallthrough to the plain FMAX cases below.
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMAX;
        break;
      }
    }

    if (Opcode)
      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
  }

  EVT CondVT = Cond.getValueType();
  if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
      CondVT.getVectorElementType() == MVT::i1) {
    // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
    // lowering on AVX-512. In this case we convert it to
    // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
    // The same situation for all 128 and 256-bit vectors of i8 and i16
    EVT OpVT = LHS.getValueType();
    if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
        (OpVT.getVectorElementType() == MVT::i8 ||
         OpVT.getVectorElementType() == MVT::i16)) {
      // Sign-extend the i1 condition to the operand width so the normal
      // vector-select lowering can handle it.
      Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
      DCI.AddToWorklist(Cond.getNode());
      return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
    }
  }
  // If this is a select between two integer constants, try to do some
  // optimizations.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
      // Don't do this for crazy integer types.
      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
        // so that TrueC (the true value) is larger than FalseC.
        bool NeedsCondInvert = false;

        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
            // Efficiently invertible.
            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
              isa<ConstantSDNode>(Cond.getOperand(1))))) {
          NeedsCondInvert = true;
          std::swap(TrueC, FalseC);
        }

        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
        if (FalseC->getAPIntValue() == 0 &&
            TrueC->getAPIntValue().isPowerOf2()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);

          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
                             DAG.getConstant(ShAmt, MVT::i8));
        }

        // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                             FalseC->getValueType(0), Cond);
          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                             SDValue(FalseC, 0));
        }

        // Optimize cases that will turn into an LEA instruction.  This requires
        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

          // Differences of 1/2/3/4/5/8/9 can all be produced with a single
          // LEA (base plus scaled index).
          bool isFastMultiplier = false;
          if (Diff < 10) {
            switch ((unsigned char)Diff) {
            default: break;
            case 1:  // result = add base, cond
            case 2:  // result = lea base(    , cond*2)
            case 3:  // result = lea base(cond, cond*2)
            case 4:  // result = lea base(    , cond*4)
            case 5:  // result = lea base(cond, cond*4)
            case 8:  // result = lea base(    , cond*8)
            case 9:  // result = lea base(cond, cond*8)
              isFastMultiplier = true;
              break;
            }
          }

          if (isFastMultiplier) {
            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
            if (NeedsCondInvert) // Invert the condition if needed.
              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(1, Cond.getValueType()));

            // Zero extend the condition if needed.
            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                               Cond);
            // Scale the condition by the difference.
            if (Diff != 1)
              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(Diff, Cond.getValueType()));

            // Add the base if non-zero.
            if (FalseC->getAPIntValue() != 0)
              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                                 SDValue(FalseC, 0));
            return Cond;
          }
        }
      }
  }

  // Canonicalize max and min:
  // (x > y) ? x : y -> (x >= y) ? x : y
  // (x < y) ? x : y -> (x <= y) ? x : y
  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
  // the need for an extra compare
  // against zero. e.g.
  // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
  //
  // subl   %esi, %edi
  // testl  %edi, %edi
  // movl   $0, %eax
  // cmovgl %edi, %eax
  // =>
  // xorl   %eax, %eax
  // subl   %esi, $edi
  // cmovsl %eax, %edi
  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    switch (CC) {
    default: break;
    case ISD::SETLT:
    case ISD::SETGT: {
      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
      return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
    }
    }
  }

  // Early exit check
  if (!TLI.isTypeLegal(VT))
    return SDValue();

  // Match VSELECTs into subs with unsigned saturation.
  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
      ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
       (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
    // left side invert the predicate to simplify logic below.
    SDValue Other;
    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
      Other = RHS;
      CC = ISD::getSetCCInverse(CC, true);
    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
      Other = LHS;
    }

    if (Other.getNode() && Other->getNumOperands() == 2 &&
        DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
      SDValue CondRHS = Cond->getOperand(1);

      // Look for a general sub with unsigned saturation first.
      // x >= y ? x-y : 0 --> subus x, y
      // x >  y ? x-y : 0 --> subus x, y
      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);

      // If the RHS is a constant we have to reverse the const canonicalization.
      // x > C-1 ? x+-C : 0 --> subus x, C
      if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
          isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
        if (CondRHS.getConstantOperandVal(0) == -A-1)
          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
                             DAG.getConstant(-A, VT));
      }

      // Another special case: If C was a sign bit, the sub has been
      // canonicalized into a xor.
      // FIXME: Would it be better to use ComputeMaskedBits to determine whether
      //        it's safe to decanonicalize the xor?
      // x s< 0 ? x^C : 0 --> subus x, C
      if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
          ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
          isSplatVector(OpRHS.getNode())) {
        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
        if (A.isSignBit())
          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
      }
    }
  }

  // Try to match a min/max vector operation.
  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
    std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
    unsigned Opc = ret.first;
    bool NeedSplit = ret.second;

    if (Opc && NeedSplit) {
      // 256-bit min/max without AVX2: do it as two 128-bit halves and
      // reassemble with CONCAT_VECTORS.
      unsigned NumElems = VT.getVectorNumElements();
      // Extract the LHS vectors
      SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
      SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);

      // Extract the RHS vectors
      SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
      SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);

      // Create min/max for each subvector
      LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
      RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);

      // Merge the result
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
    } else if (Opc)
      return DAG.getNode(Opc, DL, VT, LHS, RHS);
  }

  // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
      // Check if SETCC has already been promoted
      TLI.getSetCCResultType(*DAG.getContext(), VT) == Cond.getValueType()) {

    assert(Cond.getValueType().isVector() &&
           "vector select expects a vector selector!");

    EVT IntVT = Cond.getValueType();
    bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
    bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());

    if (!TValIsAllOnes && !FValIsAllZeros) {
      // Try invert the condition if true value is not all 1s and false value
      // is not all 0s.
      bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
      bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());

      if (TValIsAllZeros || FValIsAllOnes) {
        // Invert the condition and swap the arms so the all-ones/all-zeros
        // special cases below apply.
        SDValue CC = Cond.getOperand(2);
        ISD::CondCode NewCC =
          ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
                               Cond.getOperand(0).getValueType().isInteger());
        Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
        std::swap(LHS, RHS);
        TValIsAllOnes = FValIsAllOnes;
        FValIsAllZeros = TValIsAllZeros;
      }
    }

    if (TValIsAllOnes || FValIsAllZeros) {
      // select(C, -1, 0) -> C; select(C, -1, x) -> C | x;
      // select(C, x, 0) -> C & x  (all computed in the condition's int type).
      SDValue Ret;

      if (TValIsAllOnes && FValIsAllZeros)
        Ret = Cond;
      else if (TValIsAllOnes)
        Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond,
                          DAG.getNode(ISD::BITCAST, DL, IntVT, RHS));
      else if (FValIsAllZeros)
        Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond,
                          DAG.getNode(ISD::BITCAST, DL, IntVT, LHS));

      return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
    }
  }

  // If we know that this node is legal then we know that it is going to be
  // matched by one of the SSE/AVX BLEND instructions. These instructions only
  // depend on the highest bit in each word. Try to use SimplifyDemandedBits
  // to simplify previous instructions.
  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
      !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
    unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();

    // Don't optimize vector selects that map to mask-registers.
    if (BitWidth == 1)
      return SDValue();

    // Check all uses of that condition operand to check whether it will be
    // consumed by non-BLEND instructions, which may depend on all bits are set
    // properly.
    for (SDNode::use_iterator I = Cond->use_begin(),
         E = Cond->use_end(); I != E; ++I)
      if (I->getOpcode() != ISD::VSELECT)
        // TODO: Add other opcodes eventually lowered into BLEND.
        return SDValue();

    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
    // BLEND only reads the sign bit of each lane, so only that bit is
    // demanded of the condition.
    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);

    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
                                          DCI.isBeforeLegalizeOps());
    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }

  return SDValue();
}

// Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
// code.
//
// Simplify the following patterns:
// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
// to (Op EFLAGS Cond)
//
// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
// to (Op EFLAGS !Cond)
//
// where Op could be BRCOND or CMOV.
//
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
  // Quit if not CMP and SUB with its value result used.
  if (Cmp.getOpcode() != X86ISD::CMP &&
      (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
    return SDValue();

  // Quit if not used as a boolean value.
  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  // Check CMP operands. One of them should be 0 or 1 and the other should be
  // an SetCC or extended from it.
  SDValue Op1 = Cmp.getOperand(0);
  SDValue Op2 = Cmp.getOperand(1);

  SDValue SetCC;
  const ConstantSDNode* C = 0;
  // COND_E means the CMP tested for equality with the constant, so the
  // resulting condition must be inverted relative to the inner SETCC.
  bool needOppositeCond = (CC == X86::COND_E);
  bool checkAgainstTrue = false; // Is it a comparison against 1?

  if ((C = dyn_cast<ConstantSDNode>(Op1)))
    SetCC = Op2;
  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
    SetCC = Op1;
  else // Quit if all operands are not constants.
    return SDValue();

  if (C->getZExtValue() == 1) {
    needOppositeCond = !needOppositeCond;
    checkAgainstTrue = true;
  } else if (C->getZExtValue() != 0)
    // Quit if the constant is neither 0 or 1.
    return SDValue();

  bool truncatedToBoolWithAnd = false;
  // Skip (zext $x), (trunc $x), or (and $x, 1) node.
  while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
         SetCC.getOpcode() == ISD::TRUNCATE ||
         SetCC.getOpcode() == ISD::AND) {
    if (SetCC.getOpcode() == ISD::AND) {
      // Only look through (and $x, 1) — pick the non-constant operand.
      int OpIdx = -1;
      ConstantSDNode *CS;
      if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
          CS->getZExtValue() == 1)
        OpIdx = 1;
      if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
          CS->getZExtValue() == 1)
        OpIdx = 0;
      if (OpIdx == -1)
        break;
      SetCC = SetCC.getOperand(OpIdx);
      truncatedToBoolWithAnd = true;
    } else
      SetCC = SetCC.getOperand(0);
  }

  switch (SetCC.getOpcode()) {
  case X86ISD::SETCC_CARRY:
    // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
    // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
    // i.e. it's a comparison against true but the result of SETCC_CARRY is not
    // truncated to i1 using 'and'.
    if (checkAgainstTrue && !truncatedToBoolWithAnd)
      break;
    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
           "Invalid use of SETCC_CARRY!");
    // FALL THROUGH
  case X86ISD::SETCC:
    // Set the condition code or opposite one if necessary.
    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
    if (needOppositeCond)
      CC = X86::GetOppositeBranchCondition(CC);
    return SetCC.getOperand(1);
  case X86ISD::CMOV: {
    // Check whether false/true value has canonical one, i.e. 0 or 1.
    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
    // Quit if true value is not a constant.
    if (!TVal)
      return SDValue();
    // Quit if false value is not a constant.
    if (!FVal) {
      SDValue Op = SetCC.getOperand(0);
      // Skip 'zext' or 'trunc' node.
      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
          Op.getOpcode() == ISD::TRUNCATE)
        Op = Op.getOperand(0);
      // A special case for rdrand/rdseed, where 0 is set if false cond is
      // found.
      if ((Op.getOpcode() != X86ISD::RDRAND &&
           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
        return SDValue();
    }
    // Quit if false value is not the constant 0 or 1.
    bool FValIsFalse = true;
    if (FVal && FVal->getZExtValue() != 0) {
      if (FVal->getZExtValue() != 1)
        return SDValue();
      // If FVal is 1, opposite cond is needed.
      needOppositeCond = !needOppositeCond;
      FValIsFalse = false;
    }
    // Quit if TVal is not the constant opposite of FVal.
    if (FValIsFalse && TVal->getZExtValue() != 1)
      return SDValue();
    if (!FValIsFalse && TVal->getZExtValue() != 0)
      return SDValue();
    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
    if (needOppositeCond)
      CC = X86::GetOppositeBranchCondition(CC);
    // Return the EFLAGS operand of the CMOV.
    return SetCC.getOperand(3);
  }
  }

  return SDValue();
}

/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
  SDLoc DL(N);

  // If the flag operand isn't dead, don't touch this CMOV.
  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
    return SDValue();

  SDValue FalseOp = N->getOperand(0);
  SDValue TrueOp = N->getOperand(1);
  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
  SDValue Cond = N->getOperand(3);

  if (CC == X86::COND_E || CC == X86::COND_NE) {
    switch (Cond.getOpcode()) {
    default: break;
    case X86ISD::BSR:
    case X86ISD::BSF:
      // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
        return (CC == X86::COND_E) ? FalseOp : TrueOp;
    }
  }

  SDValue Flags;

  // Fold away a redundant boolean test feeding the CMOV condition.
  Flags = checkBoolTestSetCCCombine(Cond, CC);
  if (Flags.getNode() &&
      // Extra check as FCMOV only supports a subset of X86 cond.
      (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
    SDValue Ops[] = { FalseOp, TrueOp,
                      DAG.getConstant(CC, MVT::i8), Flags };
    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
                       Ops, array_lengthof(Ops));
  }

  // If this is a select between two integer constants, try to do some
  // optimizations.  Note that the operands are ordered the opposite of SELECT
  // operands.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
      // larger than FalseC (the false value).
      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
        CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueC, FalseC);
        std::swap(TrueOp, FalseOp);
      }

      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
      // This is efficient for any integer data type (including i8/i16) and
      // shift amount.
      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                           DAG.getConstant(CC, MVT::i8), Cond);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
                           DAG.getConstant(ShAmt, MVT::i8));
        if (N->getNumValues() == 2)  // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
      // for any integer data type, including i8/i16.
      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                           DAG.getConstant(CC, MVT::i8), Cond);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                           FalseC->getValueType(0), Cond);
        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                           SDValue(FalseC, 0));

        if (N->getNumValues() == 2)  // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize cases that will turn into an LEA instruction.  This requires
      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

        bool isFastMultiplier = false;
        if (Diff < 10) {
          switch ((unsigned char)Diff) {
          default: break;
          case 1:  // result = add base, cond
          case 2:  // result = lea base(    , cond*2)
          case 3:  // result = lea base(cond, cond*2)
          case 4:  // result = lea base(    , cond*4)
          case 5:  // result = lea base(cond, cond*4)
          case 8:  // result = lea base(    , cond*8)
          case 9:  // result = lea base(cond, cond*8)
            isFastMultiplier = true;
            break;
          }
        }

        if (isFastMultiplier) {
          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                             DAG.getConstant(CC, MVT::i8), Cond);
          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                             Cond);
          // Scale the condition by the difference.
          if (Diff != 1)
            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(Diff, Cond.getValueType()));

          // Add the base if non-zero.
          if (FalseC->getAPIntValue() != 0)
            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                               SDValue(FalseC, 0));
          if (N->getNumValues() == 2)  // Dead flag value?
            return DCI.CombineTo(N, Cond, SDValue());
          return Cond;
        }
      }
    }
  }

  // Handle these cases:
  //   (select (x != c), e, c) -> select (x != c), e, x),
  //   (select (x == c), c, e) -> select (x == c), x, e)
  // where the c is an integer constant, and the "select" is the combination
  // of CMOV and CMP.
  //
  // The rationale for this change is that the conditional-move from a constant
  // needs two instructions, however, conditional-move from a register needs
  // only one instruction.
  //
  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
  //         some instruction-combining opportunities. This opt needs to be
  //         postponed as late as possible.
  //
  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
    // the DCI.xxxx conditions are provided to postpone the optimization as
    // late as possible.

    ConstantSDNode *CmpAgainst = 0;
    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
        !isa<ConstantSDNode>(Cond.getOperand(0))) {

      // Normalize the NE form to the EQ form handled below.
      if (CC == X86::COND_NE &&
          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
        CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueOp, FalseOp);
      }

      if (CC == X86::COND_E &&
          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
                          DAG.getConstant(CC, MVT::i8), Cond };
        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops,
                           array_lengthof(Ops));
      }
    }
  }

  return SDValue();
}

/// PerformMulCombine - Optimize a single multiply with constant into two
/// in order to implement it with two cheaper instructions, e.g.
/// LEA + SHL, LEA + LEA.
static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();
  uint64_t MulAmt = C->getZExtValue();
  // Single-instruction multipliers (shift, or one LEA) need no split.
  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
    return SDValue();

  uint64_t MulAmt1 = 0;
  uint64_t MulAmt2 = 0;
  if ((MulAmt % 9) == 0) {
    MulAmt1 = 9;
    MulAmt2 = MulAmt / 9;
  } else if ((MulAmt % 5) == 0) {
    MulAmt1 = 5;
    MulAmt2 = MulAmt / 5;
  } else if ((MulAmt % 3) == 0) {
    MulAmt1 = 3;
    MulAmt2 = MulAmt / 3;
  }
  if (MulAmt2 &&
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
    SDLoc DL(N);

    if (isPowerOf2_64(MulAmt2) &&
        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If second multiplier is pow2, issue it first. We want the multiply by
      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
      // is an add.
      std::swap(MulAmt1, MulAmt2);

    SDValue NewMul;
    if (isPowerOf2_64(MulAmt1))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                           DAG.getConstant(MulAmt1, VT));

    if (isPowerOf2_64(MulAmt2))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                           DAG.getConstant(MulAmt2, VT));

    // Do not add new nodes to DAG combiner worklist.
    DCI.CombineTo(N, NewMul, false);
  }
  return SDValue();
}

/// PerformSHLCombine - Fold (shl (and (setcc_c), c1), c2) into an AND with a
/// shifted mask, and turn a splat vector shift-by-one into an ADD.
static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  EVT VT = N0.getValueType();

  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
  // since the result of setcc_c is all zero's or all ones.
  if (VT.isInteger() && !VT.isVector() &&
      N1C && N0.getOpcode() == ISD::AND &&
      N0.getOperand(1).getOpcode() == ISD::Constant) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
        ((N00.getOpcode() == ISD::ANY_EXTEND ||
          N00.getOpcode() == ISD::ZERO_EXTEND) &&
         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
      APInt ShAmt = N1C->getAPIntValue();
      Mask = Mask.shl(ShAmt);
      // Only fold when the shifted mask is still non-zero.
      if (Mask != 0)
        return DAG.getNode(ISD::AND, SDLoc(N), VT,
                           N00, DAG.getConstant(Mask, VT));
    }
  }

  // Hardware support for vector shifts is sparse which makes us scalarize the
  // vector operations in many cases. Also, on sandybridge ADD is faster than
  // shl.
  // (shl V, 1) -> add V,V
  if (isSplatVector(N1.getNode())) {
    assert(N0.getValueType().isVector() && "Invalid vector shift type");
    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
    // We shift all of the values by one. In many cases we do not have
    // hardware support for this operation. This is better expressed as an ADD
    // of two values.
    if (N1C && (1 == N1C->getZExtValue())) {
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
    }
  }

  return SDValue();
}

/// \brief Returns a vector of 0s if the node in input is a vector logical
/// shift by a constant amount which is known to be bigger than or equal
/// to the vector element size in bits.
static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);

  // Only the SSE2 128-bit types, plus the AVX2 256-bit types, are handled.
  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
      (!Subtarget->hasInt256() ||
       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
    return SDValue();

  SDValue Amt = N->getOperand(1);
  SDLoc DL(N);
  if (isSplatVector(Amt.getNode())) {
    SDValue SclrAmt = Amt->getOperand(0);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
      APInt ShiftAmt = C->getAPIntValue();
      unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();

      // SSE2/AVX2 logical shifts always return a vector of 0s
      // if the shift amount is bigger than or equal to
      // the element size. The constant shift amount will be
      // encoded as a 8-bit immediate.
      if (ShiftAmt.trunc(8).uge(MaxAmount))
        return getZeroVector(VT, Subtarget, DAG, DL);
    }
  }

  return SDValue();
}

/// PerformShiftCombine - Combine shifts.
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
  if (N->getOpcode() == ISD::SHL) {
    SDValue V = PerformSHLCombine(N, DAG);
    if (V.getNode()) return V;
  }

  // SHL and SRL (but not SRA) of an oversized constant amount become zero.
  if (N->getOpcode() != ISD::SRA) {
    // Try to fold this logical shift into a zero vector.
    SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
    if (V.getNode()) return V;
  }

  return SDValue();
}

// CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
// and friends.  Likewise for OR -> CMPNEQSS.
static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget *Subtarget) {
  unsigned opcode;

  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
  // we're requiring SSE2 for both.
  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue CMP0 = N0->getOperand(1);
    SDValue CMP1 = N1->getOperand(1);
    SDLoc DL(N);

    // The SETCCs should both refer to the same CMP.
    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
      return SDValue();

    SDValue CMP00 = CMP0->getOperand(0);
    SDValue CMP01 = CMP0->getOperand(1);
    EVT     VT = CMP00.getValueType();

    if (VT == MVT::f32 || VT == MVT::f64) {
      bool ExpectingFlags = false;
      // Check for any users that want flags:
      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
           !ExpectingFlags && UI != UE; ++UI)
        switch (UI->getOpcode()) {
        // Unknown users conservatively count as wanting flags (fall through
        // from default).
        default:
        case ISD::BR_CC:
        case ISD::BRCOND:
        case ISD::SELECT:
          ExpectingFlags = true;
          break;
        case ISD::CopyToReg:
        case ISD::SIGN_EXTEND:
        case ISD::ZERO_EXTEND:
        case ISD::ANY_EXTEND:
          break;
        }

      if (!ExpectingFlags) {
        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);

        // Canonicalize so that any E/NE condition ends up in cc0.
        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
          X86::CondCode tmp = cc0;
          cc0 = cc1;
          cc1 = tmp;
        }

        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
          X86ISD::NodeType NTOperator = is64BitFP ?
            X86ISD::FSETCCsd : X86ISD::FSETCCss;
          // FIXME: need symbolic constants for these magic numbers.
          // See X86ATTInstPrinter.cpp:printSSECC().
          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
          SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
                                              DAG.getConstant(x86cc, MVT::i8));
          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
                                              OnesOrZeroesF);
          SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
                                      DAG.getConstant(1, MVT::i32));
          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
          return OneBitOfTruth;
        }
      }
    }
  }
  return SDValue();
}

/// CanFoldXORWithAllOnes - Test whether the XOR operand is an AllOnes vector
/// so it can be folded inside ANDNP.
static bool CanFoldXORWithAllOnes(const SDNode *N) {
  EVT VT = N->getValueType(0);

  // Match direct AllOnes for 128 and 256-bit vectors
  if (ISD::isBuildVectorAllOnes(N))
    return true;

  // Look through a bit convert.
  if (N->getOpcode() == ISD::BITCAST)
    N = N->getOperand(0).getNode();

  // Sometimes the operand may come from an insert_subvector building a 256-bit
  // allones vector
  if (VT.is256BitVector() &&
      N->getOpcode() == ISD::INSERT_SUBVECTOR) {
    SDValue V1 = N->getOperand(0);
    SDValue V2 = N->getOperand(1);

    // Both inserted halves must be all-ones (the base vector is undef).
    if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
        ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
        ISD::isBuildVectorAllOnes(V2.getNode()))
      return true;
  }

  return false;
}

// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
// some of the transition sequences.
static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.is256BitVector())
    return SDValue();

  assert((N->getOpcode() == ISD::ANY_EXTEND ||
          N->getOpcode() == ISD::ZERO_EXTEND ||
          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");

  SDValue Narrow = N->getOperand(0);
  EVT NarrowVT = Narrow->getValueType(0);
  if (!NarrowVT.is128BitVector())
    return SDValue();

  // Only widen bitwise logic on the narrow value.
  if (Narrow->getOpcode() != ISD::XOR &&
      Narrow->getOpcode() != ISD::AND &&
      Narrow->getOpcode() != ISD::OR)
    return SDValue();

  SDValue N0  = Narrow->getOperand(0);
  SDValue N1  = Narrow->getOperand(1);
  SDLoc DL(Narrow);

  // The Left side has to be a trunc.
  if (N0.getOpcode() != ISD::TRUNCATE)
    return SDValue();

  // The type of the truncated inputs.
  EVT WideVT = N0->getOperand(0)->getValueType(0);
  if (WideVT != VT)
    return SDValue();

  // The right side has to be a 'trunc' or a constant vector.
  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
  bool RHSConst = (isSplatVector(N1.getNode()) &&
                   isa<ConstantSDNode>(N1->getOperand(0)));
  if (!RHSTrunc && !RHSConst)
    return SDValue();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
    return SDValue();

  // Set N0 and N1 to hold the inputs to the new wide operation.
  N0 = N0->getOperand(0);
  if (RHSConst) {
    // Re-splat the constant at the wide element type.
    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
                     N1->getOperand(0));
    SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
    N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size());
  } else if (RHSTrunc) {
    N1 = N1->getOperand(0);
  }

  // Generate the wide operation.
  SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
  unsigned Opcode = N->getOpcode();
  switch (Opcode) {
  case ISD::ANY_EXTEND:
    return Op;
  case ISD::ZERO_EXTEND: {
    // Re-establish the zero-extended form by masking off the high bits.
    unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
    APInt Mask = APInt::getAllOnesValue(InBits);
    Mask = Mask.zext(VT.getScalarType().getSizeInBits());
    return DAG.getNode(ISD::AND, DL, VT,
                       Op, DAG.getConstant(Mask, VT));
  }
  case ISD::SIGN_EXTEND:
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
                       Op, DAG.getValueType(NarrowVT));
  default:
    llvm_unreachable("Unexpected opcode");
  }
}

/// PerformAndCombine - Combine ISD::AND nodes: rewrite FP setcc pairs via
/// CMPEQCombine, form BMI/BMI2 BLSI/BLSR/BZHI and BMI/TBM BEXTR nodes for
/// scalar types, and form ANDNP nodes for vector types.
static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
  if (R.getNode())
    return R;

  // Create BLSI, BLSR, and BZHI instructions
  // BLSI is X & (-X)
  // BLSR is X & (X-1)
  // BZHI is X & ((1 << Y) - 1)
  // BEXTR is ((X >> imm) & (2**size-1))
  if (VT == MVT::i32 || VT == MVT::i64) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDLoc DL(N);

    if (Subtarget->hasBMI()) {
      // Check LHS for neg
      if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
          isZero(N0.getOperand(0)))
        return DAG.getNode(X86ISD::BLSI, DL, VT, N1);

      // Check RHS for neg
      if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
          isZero(N1.getOperand(0)))
        return DAG.getNode(X86ISD::BLSI, DL, VT, N0);

      // Check LHS for X-1
      if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
          isAllOnes(N0.getOperand(1)))
        return DAG.getNode(X86ISD::BLSR, DL, VT, N1);

      // Check RHS for X-1
      if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
          isAllOnes(N1.getOperand(1)))
        return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
    }

    if (Subtarget->hasBMI2()) {
      // Check for (and (add (shl 1, Y), -1), X)
      if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) {
        SDValue N00 = N0.getOperand(0);
        if (N00.getOpcode() == ISD::SHL) {
          SDValue N001 = N00.getOperand(1);
          assert(N001.getValueType() == MVT::i8 && "unexpected type");
          ConstantSDNode *C = dyn_cast<ConstantSDNode>(N00.getOperand(0));
          if (C && C->getZExtValue() == 1)
            return DAG.getNode(X86ISD::BZHI, DL, VT, N1, N001);
        }
      }

      // Check for (and X, (add (shl 1, Y), -1))
      if (N1.getOpcode() == ISD::ADD && isAllOnes(N1.getOperand(1))) {
        SDValue N10 = N1.getOperand(0);
        if (N10.getOpcode() == ISD::SHL) {
          SDValue N101 = N10.getOperand(1);
          assert(N101.getValueType() == MVT::i8 && "unexpected type");
          ConstantSDNode *C = dyn_cast<ConstantSDNode>(N10.getOperand(0));
          if (C && C->getZExtValue() == 1)
            return DAG.getNode(X86ISD::BZHI, DL, VT, N0, N101);
        }
      }
    }

    // Check for BEXTR.
    if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
        (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
      ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
      ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
      if (MaskNode && ShiftNode) {
        uint64_t Mask = MaskNode->getZExtValue();
        uint64_t Shift = ShiftNode->getZExtValue();
        if (isMask_64(Mask)) {
          uint64_t MaskSize = CountPopulation_64(Mask);
          // BEXTR's immediate packs the start bit in bits 7:0 and the length
          // in bits 15:8.
          if (Shift + MaskSize <= VT.getSizeInBits())
            return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
                               DAG.getConstant(Shift | (MaskSize << 8), VT));
        }
      }
    } // BEXTR

    return SDValue();
  }

  // Want to form ANDNP nodes:
  //  1) In the hopes of then easily combining them with OR and AND nodes
  //     to form PBLEND/PSIGN.
  //  2) To match ANDN packed intrinsics
  if (VT != MVT::v2i64 && VT != MVT::v4i64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);

  // Check LHS for vnot
  if (N0.getOpcode() == ISD::XOR &&
      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);

  // Check RHS for vnot
  if (N1.getOpcode() == ISD::XOR &&
      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}

/// PerformOrCombine - Combine ISD::OR nodes: rewrite FP setcc pairs via
/// CMPEQCombine, recognize PSIGN/PBLENDVB patterns for vectors, and form
/// SHLD/SHRD double-shift nodes for scalars.
static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
  if (R.getNode())
    return R;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // look for psign/blend
  if (VT == MVT::v2i64 || VT == MVT::v4i64) {
    if (!Subtarget->hasSSSE3() ||
        (VT == MVT::v4i64 && !Subtarget->hasInt256()))
      return SDValue();

    // Canonicalize pandn to RHS
    if (N0.getOpcode() == X86ISD::ANDNP)
      std::swap(N0, N1);
    // or (and (m, y), (pandn m, x))
    if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
      SDValue Mask = N1.getOperand(0);
      SDValue X    = N1.getOperand(1);
      SDValue Y;
      if (N0.getOperand(0) == Mask)
        Y = N0.getOperand(1);
      if (N0.getOperand(1) == Mask)
        Y = N0.getOperand(0);

      // Check to see if the mask appeared in both the AND and ANDNP and
      if (!Y.getNode())
        return SDValue();

      // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
      // Look through mask bitcast.
      if (Mask.getOpcode() == ISD::BITCAST)
        Mask = Mask.getOperand(0);
      if (X.getOpcode() == ISD::BITCAST)
        X = X.getOperand(0);
      if (Y.getOpcode() == ISD::BITCAST)
        Y = Y.getOperand(0);

      EVT MaskVT = Mask.getValueType();

      // Validate that the Mask operand is a vector sra node.
      // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
      // there is no psrai.b
      unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
      unsigned SraAmt = ~0;
      if (Mask.getOpcode() == ISD::SRA) {
        SDValue Amt = Mask.getOperand(1);
        if (isSplatVector(Amt.getNode())) {
          SDValue SclrAmt = Amt->getOperand(0);
          if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt))
            SraAmt = C->getZExtValue();
        }
      } else if (Mask.getOpcode() == X86ISD::VSRAI) {
        SDValue SraC = Mask.getOperand(1);
        SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
      }
      // The mask must be a sign-bit splat: sra by EltBits-1.
      if ((SraAmt + 1) != EltBits)
        return SDValue();

      SDLoc DL(N);

      // Now we know we at least have a pblendvb with the mask val.  See if
      // we can form a psignb/w/d.
      // psign = x.type == y.type == mask.type && y = sub(0, x);
      if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
          ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
          X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
        assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
               "Unsupported VT for PSIGN");
        Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
        return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
      }
      // PBLENDVB only available on SSE 4.1
      if (!Subtarget->hasSSE41())
        return SDValue();

      EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;

      X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
      Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
      Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
      Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
      return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
    }
  }

  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  SDValue ShAmt0 = N0.getOperand(1);
  if (ShAmt0.getValueType() != MVT::i8)
    return SDValue();
  SDValue ShAmt1 = N1.getOperand(1);
  if (ShAmt1.getValueType() != MVT::i8)
    return SDValue();
  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
    ShAmt0 = ShAmt0.getOperand(0);
  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
    ShAmt1 = ShAmt1.getOperand(0);

  SDLoc DL(N);
  unsigned Opc = X86ISD::SHLD;
  SDValue Op0 = N0.getOperand(0);
  SDValue Op1 = N1.getOperand(0);
  if (ShAmt0.getOpcode() == ISD::SUB) {
    Opc = X86ISD::SHRD;
    std::swap(Op0, Op1);
    std::swap(ShAmt0, ShAmt1);
  }

  unsigned Bits = VT.getSizeInBits();
  if (ShAmt1.getOpcode() == ISD::SUB) {
    SDValue Sum = ShAmt1.getOperand(0);
    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
      // The two shift amounts must sum to the type width.
      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
        return DAG.getNode(Opc, DL, VT,
                           Op0, Op1,
                           DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
    }
  } else if (ConstantSDNode *ShAmt1C =
dyn_cast<ConstantSDNode>(ShAmt1)) { 17947 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 17948 if (ShAmt0C && 17949 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 17950 return DAG.getNode(Opc, DL, VT, 17951 N0.getOperand(0), N1.getOperand(0), 17952 DAG.getNode(ISD::TRUNCATE, DL, 17953 MVT::i8, ShAmt0)); 17954 } 17955 17956 return SDValue(); 17957} 17958 17959// Generate NEG and CMOV for integer abs. 17960static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { 17961 EVT VT = N->getValueType(0); 17962 17963 // Since X86 does not have CMOV for 8-bit integer, we don't convert 17964 // 8-bit integer abs to NEG and CMOV. 17965 if (VT.isInteger() && VT.getSizeInBits() == 8) 17966 return SDValue(); 17967 17968 SDValue N0 = N->getOperand(0); 17969 SDValue N1 = N->getOperand(1); 17970 SDLoc DL(N); 17971 17972 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) 17973 // and change it to SUB and CMOV. 17974 if (VT.isInteger() && N->getOpcode() == ISD::XOR && 17975 N0.getOpcode() == ISD::ADD && 17976 N0.getOperand(1) == N1 && 17977 N1.getOpcode() == ISD::SRA && 17978 N1.getOperand(0) == N0.getOperand(0)) 17979 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) 17980 if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { 17981 // Generate SUB & CMOV. 
17982 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), 17983 DAG.getConstant(0, VT), N0.getOperand(0)); 17984 17985 SDValue Ops[] = { N0.getOperand(0), Neg, 17986 DAG.getConstant(X86::COND_GE, MVT::i8), 17987 SDValue(Neg.getNode(), 1) }; 17988 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), 17989 Ops, array_lengthof(Ops)); 17990 } 17991 return SDValue(); 17992} 17993 17994// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes 17995static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, 17996 TargetLowering::DAGCombinerInfo &DCI, 17997 const X86Subtarget *Subtarget) { 17998 EVT VT = N->getValueType(0); 17999 if (DCI.isBeforeLegalizeOps()) 18000 return SDValue(); 18001 18002 if (Subtarget->hasCMov()) { 18003 SDValue RV = performIntegerAbsCombine(N, DAG); 18004 if (RV.getNode()) 18005 return RV; 18006 } 18007 18008 // Try forming BMI if it is available. 18009 if (!Subtarget->hasBMI()) 18010 return SDValue(); 18011 18012 if (VT != MVT::i32 && VT != MVT::i64) 18013 return SDValue(); 18014 18015 assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions"); 18016 18017 // Create BLSMSK instructions by finding X ^ (X-1) 18018 SDValue N0 = N->getOperand(0); 18019 SDValue N1 = N->getOperand(1); 18020 SDLoc DL(N); 18021 18022 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && 18023 isAllOnes(N0.getOperand(1))) 18024 return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1); 18025 18026 if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && 18027 isAllOnes(N1.getOperand(1))) 18028 return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0); 18029 18030 return SDValue(); 18031} 18032 18033/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. 
static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
  LoadSDNode *Ld = cast<LoadSDNode>(N);
  EVT RegVT = Ld->getValueType(0);
  EVT MemVT = Ld->getMemoryVT();
  SDLoc dl(Ld);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  unsigned RegSz = RegVT.getSizeInBits();

  // On Sandybridge unaligned 256bit loads are inefficient.
  // Split such a load into two 128-bit halves joined by an INSERT/TokenFactor.
  ISD::LoadExtType Ext = Ld->getExtensionType();
  unsigned Alignment = Ld->getAlignment();
  bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
      !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
    unsigned NumElems = RegVT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    SDValue Ptr = Ld->getBasePtr();
    SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());

    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                                  NumElems/2);
    SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
                                Ld->getPointerInfo(), Ld->isVolatile(),
                                Ld->isNonTemporal(), Ld->isInvariant(),
                                Alignment);
    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
    // NOTE(review): the second half reuses the base PointerInfo without a
    // +16 offset — presumably conservative aliasing info; confirm.
    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
                                Ld->getPointerInfo(), Ld->isVolatile(),
                                Ld->isNonTemporal(), Ld->isInvariant(),
                                std::min(16U, Alignment));
    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                             Load1.getValue(1),
                             Load2.getValue(1));

    SDValue NewVec = DAG.getUNDEF(RegVT);
    NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
    NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
    return DCI.CombineTo(N, NewVec, TF, true);
  }

  // If this is a vector EXT Load then attempt to optimize it using a
  // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
  // expansion is still better than scalar code.
  // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
  // emit a shuffle and a arithmetic shift.
  // TODO: It is possible to support ZExt by zeroing the undef values
  // during the shuffle phase or after the shuffle.
  if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
      (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
    assert(MemVT != RegVT && "Cannot extend to the same type");
    assert(MemVT.isVector() && "Must load a vector from memory");

    unsigned NumElems = RegVT.getVectorNumElements();
    unsigned MemSz = MemVT.getSizeInBits();
    assert(RegSz > MemSz && "Register size must be greater than the mem size");

    if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
      return SDValue();

    // All sizes must be a power of two.
    if (!isPowerOf2_32(RegSz * MemSz * NumElems))
      return SDValue();

    // Attempt to load the original value using scalar loads.
    // Find the largest scalar type that divides the total loaded size.
    MVT SclrLoadTy = MVT::i8;
    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
      MVT Tp = (MVT::SimpleValueType)tp;
      if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
        SclrLoadTy = Tp;
      }
    }

    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
    if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
        (64 <= MemSz))
      SclrLoadTy = MVT::f64;

    // Calculate the number of scalar loads that we need to perform
    // in order to load our vector from memory.
    unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
    if (Ext == ISD::SEXTLOAD && NumLoads > 1)
      return SDValue();

    // For 256-bit SEXTLOAD only half the register is filled from memory;
    // the sign-extension below doubles the element width.
    unsigned loadRegZize = RegSz;
    if (Ext == ISD::SEXTLOAD && RegSz == 256)
      loadRegZize /= 2;

    // Represent our vector as a sequence of elements which are the
    // largest scalar that we can load.
    EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
      loadRegZize/SclrLoadTy.getSizeInBits());

    // Represent the data using the same element type that is stored in
    // memory. In practice, we ''widen'' MemVT.
    EVT WideVecVT =
          EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                       loadRegZize/MemVT.getScalarType().getSizeInBits());

    assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
      "Invalid vector type");

    // We can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT))
      return SDValue();

    SmallVector<SDValue, 8> Chains;
    SDValue Ptr = Ld->getBasePtr();
    SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
                                        TLI.getPointerTy());
    SDValue Res = DAG.getUNDEF(LoadUnitVecVT);

    for (unsigned i = 0; i < NumLoads; ++i) {
      // Perform a single load.
      SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
                                       Ptr, Ld->getPointerInfo(),
                                       Ld->isVolatile(), Ld->isNonTemporal(),
                                       Ld->isInvariant(), Ld->getAlignment());
      Chains.push_back(ScalarLoad.getValue(1));
      // Create the first element type using SCALAR_TO_VECTOR in order to avoid
      // another round of DAGCombining.
      if (i == 0)
        Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
      else
        Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
                          ScalarLoad, DAG.getIntPtrConstant(i));

      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
    }

    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
                             Chains.size());

    // Bitcast the loaded value to a vector of the original element type, in
    // the size of the target vector type.
    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
    unsigned SizeRatio = RegSz/MemSz;

    if (Ext == ISD::SEXTLOAD) {
      // If we have SSE4.1 we can directly emit a VSEXT node.
      if (Subtarget->hasSSE41()) {
        SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
        return DCI.CombineTo(N, Sext, TF, true);
      }

      // Otherwise we'll shuffle the small elements in the high bits of the
      // larger type and perform an arithmetic shift. If the shift is not legal
      // it's better to scalarize.
      if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
        return SDValue();

      // Redistribute the loaded elements into the different locations.
      // Place each source element in the top part of its destination slot.
      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
      for (unsigned i = 0; i != NumElems; ++i)
        ShuffleVec[i*SizeRatio + SizeRatio-1] = i;

      SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
                                           DAG.getUNDEF(WideVecVT),
                                           &ShuffleVec[0]);

      Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);

      // Build the arithmetic shift.
      unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
                     MemVT.getVectorElementType().getSizeInBits();
      Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff,
                          DAG.getConstant(Amt, RegVT));

      return DCI.CombineTo(N, Shuff, TF, true);
    }

    // Redistribute the loaded elements into the different locations.
    // EXTLOAD: put each element at the bottom of its slot; the high parts
    // stay undef, which is fine for an any-extending load.
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i*SizeRatio] = i;

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
                                         DAG.getUNDEF(WideVecVT),
                                         &ShuffleVec[0]);

    // Bitcast to the requested type.
    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
    // Replace the original load with the new sequence
    // and return the new chain.
    return DCI.CombineTo(N, Shuff, TF, true);
  }

  return SDValue();
}

/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  EVT StVT = St->getMemoryVT();
  SDLoc dl(St);
  SDValue StoredVal = St->getOperand(1);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If we are saving a concatenation of two XMM registers, perform two stores.
  // On Sandy Bridge, 256-bit memory operations are executed by two
  // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
  // memory operation.
  unsigned Alignment = St->getAlignment();
  bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
  if (VT.is256BitVector() && !Subtarget->hasInt256() &&
      StVT == VT && !IsAligned) {
    unsigned NumElems = VT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
    SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);

    SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
    SDValue Ptr0 = St->getBasePtr();
    SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);

    SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
                               St->getPointerInfo(), St->isVolatile(),
                               St->isNonTemporal(), Alignment);
    SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
                               St->getPointerInfo(), St->isVolatile(),
                               St->isNonTemporal(),
                               std::min(16U, Alignment));
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
  }

  // Optimize trunc store (of multiple scalars) to shuffle and store.
  // First, pack all of the elements in one place. Next, store to memory
  // in fewer chunks.
  if (St->isTruncatingStore() && VT.isVector()) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    unsigned NumElems = VT.getVectorNumElements();
    assert(StVT != VT && "Cannot truncate to the same type");
    unsigned FromSz = VT.getVectorElementType().getSizeInBits();
    unsigned ToSz = StVT.getVectorElementType().getSizeInBits();

    // From, To sizes and ElemCount must be pow of two
    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
    // We are going to use the original vector elt for storing.
    // Accumulated smaller vector elements must be a multiple of the store size.
    if (0 != (NumElems * FromSz) % ToSz) return SDValue();

    unsigned SizeRatio = FromSz / ToSz;

    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

    // Create a type on which we perform the shuffle
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
            StVT.getScalarType(), NumElems*SizeRatio);

    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

    // Select every SizeRatio-th narrow element, packing the truncated
    // values into the low part of the wide register.
    SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT))
      return SDValue();

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                         DAG.getUNDEF(WideVecVT),
                                         &ShuffleVec[0]);
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to mem.

    // Find the largest store unit
    MVT StoreType = MVT::i8;
    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
      MVT Tp = (MVT::SimpleValueType)tp;
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
        StoreType = Tp;
    }

    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
        (64 <= NumElems * ToSz))
      StoreType = MVT::f64;

    // Bitcast the original vector into a vector of store-size units
    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
    SmallVector<SDValue, 8> Chains;
    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
                                        TLI.getPointerTy());
    SDValue Ptr = St->getBasePtr();

    // Perform one or more big stores into memory.
    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                                   StoreType, ShuffWide,
                                   DAG.getIntPtrConstant(i));
      SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
                                St->getPointerInfo(), St->isVolatile(),
                                St->isNonTemporal(), St->getAlignment());
      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
      Chains.push_back(Ch);
    }

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
                       Chains.size());
  }

  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS.  This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->getAttributes().
    hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
  bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
                     && Subtarget->hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    SDNode* LdVal = St->getValue().getNode();
    LoadSDNode *Ld = 0;
    int TokenFactorIndex = -1;
    SmallVector<SDValue, 8> Ops;
    SDNode* ChainVal = St->getChain().getNode();
    // Must be a store of a load.  We currently handle two cases:  the load
    // is a direct child, and it's under an intervening TokenFactor.  It is
    // possible to dig deeper under nested TokenFactors.
    if (ChainVal == LdVal)
      Ld = cast<LoadSDNode>(St->getChain());
    else if (St->getValue().hasOneUse() &&
             ChainVal->getOpcode() == ISD::TokenFactor) {
      // Remember which TokenFactor operand is the load; collect the others
      // so the TokenFactor can be rebuilt around the replacement load.
      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
        if (ChainVal->getOperand(i).getNode() == LdVal) {
          TokenFactorIndex = i;
          Ld = cast<LoadSDNode>(St->getValue());
        } else
          Ops.push_back(ChainVal->getOperand(i));
      }
    }

    if (!Ld || !ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    SDLoc LdDL(Ld);
    SDLoc StDL(N);
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget->is64Bit() || F64IsLegal) {
      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
                                  Ld->getPointerInfo(), Ld->isVolatile(),
                                  Ld->isNonTemporal(), Ld->isInvariant(),
                                  Ld->getAlignment());
      SDValue NewChain = NewLd.getValue(1);
      if (TokenFactorIndex != -1) {
        Ops.push_back(NewChain);
        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                               Ops.size());
      }
      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                          St->getPointerInfo(),
                          St->isVolatile(), St->isNonTemporal(),
                          St->getAlignment());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
                                 DAG.getConstant(4, MVT::i32));

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getPointerInfo(),
                               Ld->isVolatile(), Ld->isNonTemporal(),
                               Ld->isInvariant(), Ld->getAlignment());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getPointerInfo().getWithOffset(4),
                               Ld->isVolatile(), Ld->isNonTemporal(),
                               Ld->isInvariant(),
                               MinAlign(Ld->getAlignment(), 4));

    SDValue NewChain = LoLd.getValue(1);
    if (TokenFactorIndex != -1) {
      Ops.push_back(LoLd);
      Ops.push_back(HiLd);
      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                             Ops.size());
    }

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
                         DAG.getConstant(4, MVT::i32));

    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
                                St->getPointerInfo(),
                                St->isVolatile(), St->isNonTemporal(),
                                St->getAlignment());
    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
                                St->getPointerInfo().getWithOffset(4),
                                St->isVolatile(),
                                St->isNonTemporal(),
                                MinAlign(St->getAlignment(), 4));
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }
  return SDValue();
}

/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS.  A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector.  For example, if
///   A = < float a0, float a1, float a2, float a3 >
/// and
///   B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
/// Note that the binary operation should have the property that if one of the
/// operands is UNDEF then the result is UNDEF.
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
  // Look for the following pattern: if
  //   A = < float a0, float a1, float a2, float a3 >
  //   B = < float b0, float b1, float b2, float b3 >
  // and
  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
  // which is A horizontal-op B.

  // At least one of the operands should be a vector shuffle.
  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

  MVT VT = LHS.getSimpleValueType();

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for horizontal add/sub");

  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
  // operate independently on 128-bit lanes.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts / NumLanes;
  assert((NumLaneElts % 2 == 0) &&
         "Vector type should have an even number of elements in each lane");
  unsigned HalfLaneElts = NumLaneElts/2;

  // View LHS in the form
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  // If LHS is not a shuffle then pretend it is the shuffle
  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
  // type VT.
  SDValue A, B;
  SmallVector<int, 16> LMask(NumElts);
  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
      A = LHS.getOperand(0);
    if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
      B = LHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), LMask.begin());
  } else {
    if (LHS.getOpcode() != ISD::UNDEF)
      A = LHS;
    for (unsigned i = 0; i != NumElts; ++i)
      LMask[i] = i;
  }

  // Likewise, view RHS in the form
  //   RHS = VECTOR_SHUFFLE C, D, RMask
  SDValue C, D;
  SmallVector<int, 16> RMask(NumElts);
  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
      C = RHS.getOperand(0);
    if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
      D = RHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), RMask.begin());
  } else {
    if (RHS.getOpcode() != ISD::UNDEF)
      C = RHS;
    for (unsigned i = 0; i != NumElts; ++i)
      RMask[i] = i;
  }

  // Check that the shuffles are both shuffling the same vectors.
  if (!(A == C && B == D) && !(A == D && B == C))
    return false;

  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
  if (!A.getNode() && !B.getNode())
    return false;

  // If A and B occur in reverse order in RHS, then "swap" them (which means
  // rewriting the mask).
  if (A != C)
    CommuteVectorShuffleMask(RMask, NumElts);

  // At this point LHS and RHS are equivalent to
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  //   RHS = VECTOR_SHUFFLE A, B, RMask
  // Check that the masks correspond to performing a horizontal operation.
  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      int LIdx = LMask[i+l], RIdx = RMask[i+l];

      // Ignore any UNDEF components.
      if (LIdx < 0 || RIdx < 0 ||
          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
        continue;

      // Check that successive elements are being operated on.  If not, this is
      // not a horizontal operation.
      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
      if (!(LIdx == Index && RIdx == Index + 1) &&
          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
        return false;
    }
  }

  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
  return true;
}

/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Try to synthesize horizontal adds from adds of shuffles.
  // FADD is commutative, so pass IsCommutative = true.
  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
       (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
      isHorizontalBinOp(LHS, RHS, true))
    return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
  return SDValue();
}

/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
18604static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, 18605 const X86Subtarget *Subtarget) { 18606 EVT VT = N->getValueType(0); 18607 SDValue LHS = N->getOperand(0); 18608 SDValue RHS = N->getOperand(1); 18609 18610 // Try to synthesize horizontal subs from subs of shuffles. 18611 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 18612 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 18613 isHorizontalBinOp(LHS, RHS, false)) 18614 return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS); 18615 return SDValue(); 18616} 18617 18618/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 18619/// X86ISD::FXOR nodes. 18620static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 18621 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 18622 // F[X]OR(0.0, x) -> x 18623 // F[X]OR(x, 0.0) -> x 18624 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 18625 if (C->getValueAPF().isPosZero()) 18626 return N->getOperand(1); 18627 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 18628 if (C->getValueAPF().isPosZero()) 18629 return N->getOperand(0); 18630 return SDValue(); 18631} 18632 18633/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and 18634/// X86ISD::FMAX nodes. 18635static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { 18636 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); 18637 18638 // Only perform optimizations if UnsafeMath is used. 18639 if (!DAG.getTarget().Options.UnsafeFPMath) 18640 return SDValue(); 18641 18642 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes 18643 // into FMINC and FMAXC, which are Commutative operations. 
18644 unsigned NewOp = 0; 18645 switch (N->getOpcode()) { 18646 default: llvm_unreachable("unknown opcode"); 18647 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; 18648 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; 18649 } 18650 18651 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), 18652 N->getOperand(0), N->getOperand(1)); 18653} 18654 18655/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 18656static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 18657 // FAND(0.0, x) -> 0.0 18658 // FAND(x, 0.0) -> 0.0 18659 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 18660 if (C->getValueAPF().isPosZero()) 18661 return N->getOperand(0); 18662 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 18663 if (C->getValueAPF().isPosZero()) 18664 return N->getOperand(1); 18665 return SDValue(); 18666} 18667 18668/// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes 18669static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { 18670 // FANDN(x, 0.0) -> 0.0 18671 // FANDN(0.0, x) -> x 18672 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 18673 if (C->getValueAPF().isPosZero()) 18674 return N->getOperand(1); 18675 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 18676 if (C->getValueAPF().isPosZero()) 18677 return N->getOperand(1); 18678 return SDValue(); 18679} 18680 18681static SDValue PerformBTCombine(SDNode *N, 18682 SelectionDAG &DAG, 18683 TargetLowering::DAGCombinerInfo &DCI) { 18684 // BT ignores high bits in the bit index operand. 
  // Only the low log2(BitWidth) bits of the index are consumed, so shrink
  // the index's demanded bits. Only do this when the index has one use, so
  // narrowing it cannot pessimize other users.
  SDValue Op1 = N->getOperand(1);
  if (Op1.hasOneUse()) {
    unsigned BitWidth = Op1.getValueSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    // Replaces Op1 in place via the combiner when a narrower form exists.
    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }
  return SDValue();
}

/// PerformVZEXT_MOVLCombine - Fold (vzext_movl (bitcast (vzext_load))) into
/// a plain bitcast of the load when the element sizes agree; the load
/// already zeroed the high elements.
static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue Op = N->getOperand(0);
  if (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
      VT.getVectorElementType().getSizeInBits() ==
      OpVT.getVectorElementType().getSizeInBits()) {
    return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
  }
  return SDValue();
}

/// PerformSIGN_EXTEND_INREGCombine - Rewrite an expensive v4i64
/// sign_extend_inreg in terms of a v4i32 one followed by a sign extend.
static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
                                               const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
  SDLoc dl(N);

  // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
  // both SSE and AVX2 since there is no sign-extended shift right
  // operation on a vector with 64-bit elements.
  //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
  // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
      N0.getOpcode() == ISD::SIGN_EXTEND)) {
    SDValue N00 = N0.getOperand(0);

    // EXTLOAD has a better solution on AVX2,
    // it may be replaced with X86ISD::VSEXT node.
    if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
      if (!ISD::isNormalLoad(N00.getNode()))
        return SDValue();

    // Narrow the in-reg extension to v4i32 and let a full sign extend
    // widen the result back to v4i64.
    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
                                N00, N1);
      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
    }
  }
  return SDValue();
}

/// PerformSExtCombine - Do target-specific dag combines on ISD::SIGN_EXTEND.
/// On AVX targets, try to widen 256-bit mask arithmetic before legalization.
static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();

  if (!Subtarget->hasFp256())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT.isVector() && VT.getSizeInBits() == 256) {
    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
    if (R.getNode())
      return R;
  }

  return SDValue();
}

/// PerformFMACombine - Fold FNEGs of the operands of an ISD::FMA into the
/// appropriate X86 fused-multiply-add opcode (FMADD/FMSUB/FNMADD/FNMSUB).
static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget* Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // Only scalar/vector f32 and f64, and only when the target actually has
  // FMA or FMA4 instructions.
  EVT ScalarVT = VT.getScalarType();
  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
      (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
    return SDValue();

  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);
  SDValue C = N->getOperand(2);

  bool NegA = (A.getOpcode() == ISD::FNEG);
  bool NegB = (B.getOpcode() == ISD::FNEG);
  bool NegC = (C.getOpcode() == ISD::FNEG);

  // Negative multiplication when NegA xor NegB
  bool NegMul = (NegA != NegB);
  // Strip the FNEGs; the chosen opcode re-introduces the negations.
  if (NegA)
    A = A.getOperand(0);
  if (NegB)
    B = B.getOperand(0);
  if (NegC)
    C = C.getOperand(0);

  unsigned Opcode;
  if (!NegMul)
    Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
  else
    Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;

  return DAG.getNode(Opcode, dl, VT, A, B, C);
}

static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
  //           (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.getOpcode() == ISD::AND &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      // Only the exact (and x, 1) form is rewritten; bail on any other mask.
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
      if (!C || C->getZExtValue() != 1)
        return SDValue();
      // Rebuild the SETCC_CARRY directly in the wider type and re-apply the
      // mask there.
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, VT));
    }
  }

  // On 256-bit results, also try to widen mask arithmetic.
  if (VT.is256BitVector()) {
    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
    if (R.getNode())
      return R;
  }

  return SDValue();
}

// Optimize  x == -y -->  x+y == 0
//           x != -y -->  x+y != 0
// A negation is represented here as (sub 0, y); folding it into an add
// saves materializing the negated value.
static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // LHS is the negation: x != (sub 0, y)  -->  (add x, y) != 0.
  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
      if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
        SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
                                   LHS.getValueType(), RHS, LHS.getOperand(1));
        return DAG.getSetCC(SDLoc(N), N->getValueType(0),
                            addV, DAG.getConstant(0, addV.getValueType()), CC);
      }
  // Mirror case: the negation is on the RHS.
  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
      if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
        SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
                                   RHS.getValueType(), LHS, RHS.getOperand(1));
        return DAG.getSetCC(SDLoc(N), N->getValueType(0),
                            addV, DAG.getConstant(0, addV.getValueType()), CC);
      }
  return SDValue();
}
18867 18868// Helper function of PerformSETCCCombine. It is to materialize "setb reg" 18869// as "sbb reg,reg", since it can be extended without zext and produces 18870// an all-ones bit which is more useful than 0/1 in some cases. 18871static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG) { 18872 return DAG.getNode(ISD::AND, DL, MVT::i8, 18873 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 18874 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS), 18875 DAG.getConstant(1, MVT::i8)); 18876} 18877 18878// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 18879static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, 18880 TargetLowering::DAGCombinerInfo &DCI, 18881 const X86Subtarget *Subtarget) { 18882 SDLoc DL(N); 18883 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); 18884 SDValue EFLAGS = N->getOperand(1); 18885 18886 if (CC == X86::COND_A) { 18887 // Try to convert COND_A into COND_B in an attempt to facilitate 18888 // materializing "setb reg". 18889 // 18890 // Do not flip "e > c", where "c" is a constant, because Cmp instruction 18891 // cannot take an immediate as its first operand. 18892 // 18893 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && 18894 EFLAGS.getValueType().isInteger() && 18895 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { 18896 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), 18897 EFLAGS.getNode()->getVTList(), 18898 EFLAGS.getOperand(1), EFLAGS.getOperand(0)); 18899 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); 18900 return MaterializeSETB(DL, NewEFLAGS, DAG); 18901 } 18902 } 18903 18904 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 18905 // a zext and produces an all-ones bit which is more useful than 0/1 in some 18906 // cases. 
18907 if (CC == X86::COND_B) 18908 return MaterializeSETB(DL, EFLAGS, DAG); 18909 18910 SDValue Flags; 18911 18912 Flags = checkBoolTestSetCCCombine(EFLAGS, CC); 18913 if (Flags.getNode()) { 18914 SDValue Cond = DAG.getConstant(CC, MVT::i8); 18915 return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); 18916 } 18917 18918 return SDValue(); 18919} 18920 18921// Optimize branch condition evaluation. 18922// 18923static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, 18924 TargetLowering::DAGCombinerInfo &DCI, 18925 const X86Subtarget *Subtarget) { 18926 SDLoc DL(N); 18927 SDValue Chain = N->getOperand(0); 18928 SDValue Dest = N->getOperand(1); 18929 SDValue EFLAGS = N->getOperand(3); 18930 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); 18931 18932 SDValue Flags; 18933 18934 Flags = checkBoolTestSetCCCombine(EFLAGS, CC); 18935 if (Flags.getNode()) { 18936 SDValue Cond = DAG.getConstant(CC, MVT::i8); 18937 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, 18938 Flags); 18939 } 18940 18941 return SDValue(); 18942} 18943 18944static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, 18945 const X86TargetLowering *XTLI) { 18946 SDValue Op0 = N->getOperand(0); 18947 EVT InVT = Op0->getValueType(0); 18948 18949 // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32)) 18950 if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { 18951 SDLoc dl(N); 18952 MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; 18953 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); 18954 return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); 18955 } 18956 18957 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have 18958 // a 32-bit target where SSE doesn't support i64->FP operations. 
  if (Op0.getOpcode() == ISD::LOAD) {
    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
    EVT VT = Ld->getValueType(0);
    // Only a plain (non-volatile, non-extending), single-use i64 load on a
    // 32-bit target qualifies for the FILD rewrite.
    if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
        !XTLI->getSubtarget()->is64Bit() &&
        VT == MVT::i64) {
      SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
                                          Ld->getChain(), Op0, DAG);
      // Re-point users of the load's chain at the FILD's chain so the load
      // can go away.
      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
      return FILDChain;
    }
  }
  return SDValue();
}

// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
                                 X86TargetLowering::DAGCombinerInfo &DCI) {
  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
  // the result is either zero or one (depending on the input carry bit).
  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
  if (X86::isZeroNode(N->getOperand(0)) &&
      X86::isZeroNode(N->getOperand(1)) &&
      // We don't have a good way to replace an EFLAGS use, so only do this when
      // dead right now.
      SDValue(N, 1).use_empty()) {
    SDLoc DL(N);
    EVT VT = N->getValueType(0);
    // Result 1 (the flags output) is replaced by a dead zero constant.
    SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                                           DAG.getConstant(X86::COND_B,MVT::i8),
                                           N->getOperand(2)),
                               DAG.getConstant(1, VT));
    return DCI.CombineTo(N, Res1, CarryOut);
  }

  return SDValue();
}

// fold (add Y, (sete  X, 0)) -> adc  0, Y
//      (add Y, (setne X, 0)) -> sbb -1, Y
//      (sub (sete  X, 0), Y) -> sbb  0, Y
//      (sub (setne X, 0), Y) -> adc -1, Y
static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);

  // Look through ZExts. For SUB the setcc is the first operand; for ADD it
  // may appear as either, and this combine only checks the canonical slot.
  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
    return SDValue();

  SDValue SetCC = Ext.getOperand(0);
  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
    return SDValue();

  // Only equality with zero is handled.
  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  SDValue Cmp = SetCC.getOperand(1);
  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
      !X86::isZeroNode(Cmp.getOperand(1)) ||
      !Cmp.getOperand(0).getValueType().isInteger())
    return SDValue();

  // Compare X against 1 instead of 0 so the carry flag encodes "X == 0".
  SDValue CmpOp0 = Cmp.getOperand(0);
  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
                               DAG.getConstant(1, CmpOp0.getValueType()));

  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
  if (CC == X86::COND_NE)
    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
                       DL, OtherVal.getValueType(), OtherVal,
                       DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
                     DL, OtherVal.getValueType(), OtherVal,
                     DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
}

/// PerformADDCombine - Do target-specific dag combines on integer adds:
/// horizontal adds first, then the conditional inc/dec fold above.
static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Try to synthesize horizontal adds from adds of shuffles.
  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);

  return OptimizeConditionalInDecrement(N, DAG);
}

/// PerformSubCombine - Do target-specific dag combines on integer subtracts.
static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget *Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
    // If the RHS of the sub is a XOR with one use and a constant, invert the
    // immediate. Then add one to the LHS of the sub so we can turn
    // X-Y -> X+~Y+1, saving one register.
    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
        isa<ConstantSDNode>(Op1.getOperand(1))) {
      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
      EVT VT = Op0.getValueType();
      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                   Op1.getOperand(0),
                                   DAG.getConstant(~XorC, VT));
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
                         DAG.getConstant(C->getAPIntValue()+1, VT));
    }
  }

  // Try to synthesize horizontal subs from subs of shuffles.
  EVT VT = N->getValueType(0);
  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);

  return OptimizeConditionalInDecrement(N, DAG);
}

/// performVZEXTCombine - Collapse a VZEXT of a (possibly bitcast) VZEXT into
/// a single VZEXT of the innermost source.
static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
  // (vzext (bitcast (vzext (x)) -> (vzext x)
  SDValue In = N->getOperand(0);
  while (In.getOpcode() == ISD::BITCAST)
    In = In.getOperand(0);

  if (In.getOpcode() != X86ISD::VZEXT)
    return SDValue();

  return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0),
                     In.getOperand(0));
}

/// PerformDAGCombine - Central dispatch: route each node opcode to its
/// target-specific combine above.
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
  case ISD::VSELECT:
  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
  case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
  case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
  case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
  case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
  case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
  case ISD::LOAD:
    return PerformLOADCombine(N, DAG, DCI, Subtarget);
  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
  case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
  case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
  case X86ISD::FMIN:
  case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
  case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
  case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
  case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG);
  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
  case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
  case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
  case X86ISD::SHUFP:       // Handle all target specific shuffles
  case X86ISD::PALIGNR:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::VPERMILP:
  case X86ISD::VPERM2X128:
  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
  case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
  }

  return SDValue();
}

/// isTypeDesirableForOp - Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
/// instruction encodings are longer and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  // Everything except i16 is fine as-is.
  if (VT != MVT::i16)
    return true;

  // These i16 operations are the ones worth promoting to i32.
  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}

/// IsDesirableToPromoteOp - This method query the target whether it is
/// beneficial for dag combiner to promote the specified node. If true, it
/// should return the desired promotion type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  // Only i16 values are candidates for promotion (see above).
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(Op);
    // If the non-extending load has a single use and it's not live out, then it
    // might be folded.
    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
                                                     Op.hasOneUse()*/) {
      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
        // The only case where we'd want to promote LOAD (rather then it being
        // promoted as an operand is when it's only use is liveout.
        if (UI->getOpcode() != ISD::CopyToReg)
          return false;
      }
    }
    Promote = true;
    break;
  }
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    // fallthrough -- intentional: the binary-op checks below apply too.
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    // SUB is not commutable, so a foldable load on the RHS must stay put.
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  // Promotion target is always i32.
  PVT = MVT::i32;
  return Promote;
}

//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

namespace {
  // Helper to match a string separated by whitespace.
  // Each element of 'args' must appear in order in 's', with at least one
  // space/tab between consecutive pieces, and nothing may remain afterwards.
  bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
    s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.

    for (unsigned i = 0, e = args.size(); i != e; ++i) {
      StringRef piece(*args[i]);
      if (!s.startswith(piece)) // Check if the piece matches.
        return false;

      s = s.substr(piece.size());
      StringRef::size_type pos = s.find_first_not_of(" \t");
      if (pos == 0) // We matched a prefix.
        return false;

      s = s.substr(pos);
    }

    return s.empty();
  }
  // Variadic front-end: matchAsm(str, piece1, piece2, ...).
  const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
}

// Returns true when the clobber list in AsmPieces names exactly the flag
// registers (cc/flags/fpsr, optionally dirflag) and nothing else.
static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {

  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

      if (AsmPieces.size() == 3)
        return true;
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
        return true;
    }
  }
  return false;
}

/// ExpandInlineAsm - Recognize inline asm byte-swap idioms (bswap / rorw /
/// rorl sequences) and replace the whole asm call with llvm.bswap.
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  std::string AsmStr = IA->getAsmString();

  // Only integer results whose width is a multiple of 16 can be a bswap.
  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    // FIXME: this should verify that we are targeting a 486 or better. If not,
    // we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm. For now, we don't support 486 or
    // lower so don't worry about this.
    // bswap $0
    if (matchAsm(AsmPieces[0], "bswap", "$0") ||
        matchAsm(AsmPieces[0], "bswapl", "$0") ||
        matchAsm(AsmPieces[0], "bswapq", "$0") ||
        matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
        matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
        matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
         matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
      AsmPieces.clear();
      // The rotate clobbers flags, so the clobber list must say exactly that.
      const std::string &ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
    // rorw $$8, ${0:w}; rorl $$16, $0; rorw $$8, ${0:w}  -->  llvm.bswap.i32
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
        matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
        matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
      AsmPieces.clear();
      const std::string &ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      // Requires the "A" (EDX:EAX pair) constraint tied to operand 0.
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
        if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
            matchAsm(AsmPieces[1], "bswap", "%edx") &&
            matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R':   // legacy registers
    case 'q':   // byte-addressable GPRs
    case 'Q':
    case 'f':   // x87 stack
    case 't':
    case 'u':
    case 'y':   // MMX
    case 'x':   // SSE
    case 'Y':
    case 'l':
      return C_RegisterClass;
    case 'a':   // specific GPRs: eax/ebx/ecx/edx/esi/edi and the A pair
    case 'b':
    case 'c':
    case 'd':
    case 'S':
    case 'D':
    case 'A':
      return C_Register;
    case 'I':   // immediate-range and special constraints
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'G':
    case 'C':
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
  X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
    // If we don't have a value, we can't do a match,
    // but allow it at the lowest weight.
  if (CallOperandVal == NULL)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    // NOTE(review): there is no 'break' here, so the default case falls
    // through into the specific-register checks below — confirm this
    // fallthrough is intended before restructuring.
  case 'R':
  case 'q':
  case 'Q':
  case 'a':
  case 'b':
  case 'c':
  case 'd':
  case 'S':
  case 'D':
  case 'A':
    if (CallOperandVal->getType()->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f':   // x87 stack registers
  case 't':
  case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':   // MMX register
    if (type->isX86_MMXTy() && Subtarget->hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'x':   // SSE/AVX register: size must match the available feature level
  case 'Y':
    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
      weight = CW_Register;
    break;
  case 'I':   // constant in [0, 31]
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':   // constant in [0, 63]
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':   // signed 8-bit constant
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':   // 0xff or 0xffff
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':   // constant in [0, 3] (scale for addressing)
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':   // unsigned 8-bit constant
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G':   // floating-point constant
  case 'C':
    if (dyn_cast<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':   // signed 32-bit constant
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':   // unsigned 32-bit constant
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}

/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasSSE2())
      return "Y";
    if (Subtarget->hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result(0, 0);

  // Only support length 1 constraints for now.
19557 if (Constraint.length() > 1) return; 19558 19559 char ConstraintLetter = Constraint[0]; 19560 switch (ConstraintLetter) { 19561 default: break; 19562 case 'I': 19563 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 19564 if (C->getZExtValue() <= 31) { 19565 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 19566 break; 19567 } 19568 } 19569 return; 19570 case 'J': 19571 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 19572 if (C->getZExtValue() <= 63) { 19573 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 19574 break; 19575 } 19576 } 19577 return; 19578 case 'K': 19579 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 19580 if (isInt<8>(C->getSExtValue())) { 19581 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 19582 break; 19583 } 19584 } 19585 return; 19586 case 'N': 19587 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 19588 if (C->getZExtValue() <= 255) { 19589 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 19590 break; 19591 } 19592 } 19593 return; 19594 case 'e': { 19595 // 32-bit signed value 19596 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 19597 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 19598 C->getSExtValue())) { 19599 // Widen to 64 bits here to get it sign extended. 19600 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 19601 break; 19602 } 19603 // FIXME gcc accepts some relocatable values here too, but only in certain 19604 // memory models; it's complicated. 
19605 } 19606 return; 19607 } 19608 case 'Z': { 19609 // 32-bit unsigned value 19610 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 19611 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 19612 C->getZExtValue())) { 19613 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 19614 break; 19615 } 19616 } 19617 // FIXME gcc accepts some relocatable values here too, but only in certain 19618 // memory models; it's complicated. 19619 return; 19620 } 19621 case 'i': { 19622 // Literal immediates are always ok. 19623 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 19624 // Widen to 64 bits here to get it sign extended. 19625 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 19626 break; 19627 } 19628 19629 // In any sort of PIC mode addresses need to be computed at runtime by 19630 // adding in a register or some sort of table lookup. These can't 19631 // be used as immediates. 19632 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 19633 return; 19634 19635 // If we are in non-pic codegen mode, we allow the address of a global (with 19636 // an optional displacement) to be used with 'i'. 19637 GlobalAddressSDNode *GA = 0; 19638 int64_t Offset = 0; 19639 19640 // Match either (GA), (GA+C), (GA+C1+C2), etc. 19641 while (1) { 19642 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 19643 Offset += GA->getOffset(); 19644 break; 19645 } else if (Op.getOpcode() == ISD::ADD) { 19646 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 19647 Offset += C->getZExtValue(); 19648 Op = Op.getOperand(0); 19649 continue; 19650 } 19651 } else if (Op.getOpcode() == ISD::SUB) { 19652 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 19653 Offset += -C->getZExtValue(); 19654 Op = Op.getOperand(0); 19655 continue; 19656 } 19657 } 19658 19659 // Otherwise, this isn't something we can handle, reject it. 
19660 return; 19661 } 19662 19663 const GlobalValue *GV = GA->getGlobal(); 19664 // If we require an extra load to get this address, as in PIC mode, we 19665 // can't accept it. 19666 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 19667 getTargetMachine()))) 19668 return; 19669 19670 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), 19671 GA->getValueType(0), Offset); 19672 break; 19673 } 19674 } 19675 19676 if (Result.getNode()) { 19677 Ops.push_back(Result); 19678 return; 19679 } 19680 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 19681} 19682 19683std::pair<unsigned, const TargetRegisterClass*> 19684X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 19685 MVT VT) const { 19686 // First, see if this is a constraint that directly corresponds to an LLVM 19687 // register class. 19688 if (Constraint.size() == 1) { 19689 // GCC Constraint Letters 19690 switch (Constraint[0]) { 19691 default: break; 19692 // TODO: Slight differences here in allocation order and leaving 19693 // RIP in the class. Do they matter any more here than they do 19694 // in the normal allocation? 19695 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 
19696 if (Subtarget->is64Bit()) { 19697 if (VT == MVT::i32 || VT == MVT::f32) 19698 return std::make_pair(0U, &X86::GR32RegClass); 19699 if (VT == MVT::i16) 19700 return std::make_pair(0U, &X86::GR16RegClass); 19701 if (VT == MVT::i8 || VT == MVT::i1) 19702 return std::make_pair(0U, &X86::GR8RegClass); 19703 if (VT == MVT::i64 || VT == MVT::f64) 19704 return std::make_pair(0U, &X86::GR64RegClass); 19705 break; 19706 } 19707 // 32-bit fallthrough 19708 case 'Q': // Q_REGS 19709 if (VT == MVT::i32 || VT == MVT::f32) 19710 return std::make_pair(0U, &X86::GR32_ABCDRegClass); 19711 if (VT == MVT::i16) 19712 return std::make_pair(0U, &X86::GR16_ABCDRegClass); 19713 if (VT == MVT::i8 || VT == MVT::i1) 19714 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); 19715 if (VT == MVT::i64) 19716 return std::make_pair(0U, &X86::GR64_ABCDRegClass); 19717 break; 19718 case 'r': // GENERAL_REGS 19719 case 'l': // INDEX_REGS 19720 if (VT == MVT::i8 || VT == MVT::i1) 19721 return std::make_pair(0U, &X86::GR8RegClass); 19722 if (VT == MVT::i16) 19723 return std::make_pair(0U, &X86::GR16RegClass); 19724 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) 19725 return std::make_pair(0U, &X86::GR32RegClass); 19726 return std::make_pair(0U, &X86::GR64RegClass); 19727 case 'R': // LEGACY_REGS 19728 if (VT == MVT::i8 || VT == MVT::i1) 19729 return std::make_pair(0U, &X86::GR8_NOREXRegClass); 19730 if (VT == MVT::i16) 19731 return std::make_pair(0U, &X86::GR16_NOREXRegClass); 19732 if (VT == MVT::i32 || !Subtarget->is64Bit()) 19733 return std::make_pair(0U, &X86::GR32_NOREXRegClass); 19734 return std::make_pair(0U, &X86::GR64_NOREXRegClass); 19735 case 'f': // FP Stack registers. 19736 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 19737 // value to the correct fpstack register class. 
19738 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 19739 return std::make_pair(0U, &X86::RFP32RegClass); 19740 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 19741 return std::make_pair(0U, &X86::RFP64RegClass); 19742 return std::make_pair(0U, &X86::RFP80RegClass); 19743 case 'y': // MMX_REGS if MMX allowed. 19744 if (!Subtarget->hasMMX()) break; 19745 return std::make_pair(0U, &X86::VR64RegClass); 19746 case 'Y': // SSE_REGS if SSE2 allowed 19747 if (!Subtarget->hasSSE2()) break; 19748 // FALL THROUGH. 19749 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed 19750 if (!Subtarget->hasSSE1()) break; 19751 19752 switch (VT.SimpleTy) { 19753 default: break; 19754 // Scalar SSE types. 19755 case MVT::f32: 19756 case MVT::i32: 19757 return std::make_pair(0U, &X86::FR32RegClass); 19758 case MVT::f64: 19759 case MVT::i64: 19760 return std::make_pair(0U, &X86::FR64RegClass); 19761 // Vector types. 19762 case MVT::v16i8: 19763 case MVT::v8i16: 19764 case MVT::v4i32: 19765 case MVT::v2i64: 19766 case MVT::v4f32: 19767 case MVT::v2f64: 19768 return std::make_pair(0U, &X86::VR128RegClass); 19769 // AVX types. 19770 case MVT::v32i8: 19771 case MVT::v16i16: 19772 case MVT::v8i32: 19773 case MVT::v4i64: 19774 case MVT::v8f32: 19775 case MVT::v4f64: 19776 return std::make_pair(0U, &X86::VR256RegClass); 19777 case MVT::v8f64: 19778 case MVT::v16f32: 19779 case MVT::v16i32: 19780 case MVT::v8i64: 19781 return std::make_pair(0U, &X86::VR512RegClass); 19782 } 19783 break; 19784 } 19785 } 19786 19787 // Use the default implementation in TargetLowering to convert the register 19788 // constraint into a member of a register class. 19789 std::pair<unsigned, const TargetRegisterClass*> Res; 19790 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 19791 19792 // Not found as a standard register? 
19793 if (Res.second == 0) { 19794 // Map st(0) -> st(7) -> ST0 19795 if (Constraint.size() == 7 && Constraint[0] == '{' && 19796 tolower(Constraint[1]) == 's' && 19797 tolower(Constraint[2]) == 't' && 19798 Constraint[3] == '(' && 19799 (Constraint[4] >= '0' && Constraint[4] <= '7') && 19800 Constraint[5] == ')' && 19801 Constraint[6] == '}') { 19802 19803 Res.first = X86::ST0+Constraint[4]-'0'; 19804 Res.second = &X86::RFP80RegClass; 19805 return Res; 19806 } 19807 19808 // GCC allows "st(0)" to be called just plain "st". 19809 if (StringRef("{st}").equals_lower(Constraint)) { 19810 Res.first = X86::ST0; 19811 Res.second = &X86::RFP80RegClass; 19812 return Res; 19813 } 19814 19815 // flags -> EFLAGS 19816 if (StringRef("{flags}").equals_lower(Constraint)) { 19817 Res.first = X86::EFLAGS; 19818 Res.second = &X86::CCRRegClass; 19819 return Res; 19820 } 19821 19822 // 'A' means EAX + EDX. 19823 if (Constraint == "A") { 19824 Res.first = X86::EAX; 19825 Res.second = &X86::GR32_ADRegClass; 19826 return Res; 19827 } 19828 return Res; 19829 } 19830 19831 // Otherwise, check to see if this is a register class of the wrong value 19832 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to 19833 // turn into {ax},{dx}. 19834 if (Res.second->hasType(VT)) 19835 return Res; // Correct type already, nothing to do. 19836 19837 // All of the single-register GCC register classes map their values onto 19838 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we 19839 // really want an 8-bit or 32-bit register, map to the appropriate register 19840 // class and return the appropriate register. 
19841 if (Res.second == &X86::GR16RegClass) { 19842 if (VT == MVT::i8 || VT == MVT::i1) { 19843 unsigned DestReg = 0; 19844 switch (Res.first) { 19845 default: break; 19846 case X86::AX: DestReg = X86::AL; break; 19847 case X86::DX: DestReg = X86::DL; break; 19848 case X86::CX: DestReg = X86::CL; break; 19849 case X86::BX: DestReg = X86::BL; break; 19850 } 19851 if (DestReg) { 19852 Res.first = DestReg; 19853 Res.second = &X86::GR8RegClass; 19854 } 19855 } else if (VT == MVT::i32 || VT == MVT::f32) { 19856 unsigned DestReg = 0; 19857 switch (Res.first) { 19858 default: break; 19859 case X86::AX: DestReg = X86::EAX; break; 19860 case X86::DX: DestReg = X86::EDX; break; 19861 case X86::CX: DestReg = X86::ECX; break; 19862 case X86::BX: DestReg = X86::EBX; break; 19863 case X86::SI: DestReg = X86::ESI; break; 19864 case X86::DI: DestReg = X86::EDI; break; 19865 case X86::BP: DestReg = X86::EBP; break; 19866 case X86::SP: DestReg = X86::ESP; break; 19867 } 19868 if (DestReg) { 19869 Res.first = DestReg; 19870 Res.second = &X86::GR32RegClass; 19871 } 19872 } else if (VT == MVT::i64 || VT == MVT::f64) { 19873 unsigned DestReg = 0; 19874 switch (Res.first) { 19875 default: break; 19876 case X86::AX: DestReg = X86::RAX; break; 19877 case X86::DX: DestReg = X86::RDX; break; 19878 case X86::CX: DestReg = X86::RCX; break; 19879 case X86::BX: DestReg = X86::RBX; break; 19880 case X86::SI: DestReg = X86::RSI; break; 19881 case X86::DI: DestReg = X86::RDI; break; 19882 case X86::BP: DestReg = X86::RBP; break; 19883 case X86::SP: DestReg = X86::RSP; break; 19884 } 19885 if (DestReg) { 19886 Res.first = DestReg; 19887 Res.second = &X86::GR64RegClass; 19888 } 19889 } 19890 } else if (Res.second == &X86::FR32RegClass || 19891 Res.second == &X86::FR64RegClass || 19892 Res.second == &X86::VR128RegClass || 19893 Res.second == &X86::VR256RegClass || 19894 Res.second == &X86::FR32XRegClass || 19895 Res.second == &X86::FR64XRegClass || 19896 Res.second == &X86::VR128XRegClass || 19897 
Res.second == &X86::VR256XRegClass || 19898 Res.second == &X86::VR512RegClass) { 19899 // Handle references to XMM physical registers that got mapped into the 19900 // wrong class. This can happen with constraints like {xmm0} where the 19901 // target independent register mapper will just pick the first match it can 19902 // find, ignoring the required type. 19903 19904 if (VT == MVT::f32 || VT == MVT::i32) 19905 Res.second = &X86::FR32RegClass; 19906 else if (VT == MVT::f64 || VT == MVT::i64) 19907 Res.second = &X86::FR64RegClass; 19908 else if (X86::VR128RegClass.hasType(VT)) 19909 Res.second = &X86::VR128RegClass; 19910 else if (X86::VR256RegClass.hasType(VT)) 19911 Res.second = &X86::VR256RegClass; 19912 else if (X86::VR512RegClass.hasType(VT)) 19913 Res.second = &X86::VR512RegClass; 19914 } 19915 19916 return Res; 19917} 19918